def collect_data():
    """Roll out the PPO-weighted combination of model_1 and model_2 and record [x1, x2, clipped u] samples."""
    env = Osillator()
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), False)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
                control_action = ca1 * action[0] + ca2 * action[1]
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                np.clip(control_action.cpu().data.numpy()[0], -1, 1)
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
def train_weight_adapter_DDPG(EP_NUM=2000):
    """Train a DDPG agent whose action gives the two mixing weights for model_1 and model_2."""
    mkdir('./adapter_soft')
    env = Osillator()
    scores_deque = deque(maxlen=100)
    scores = []
    for ep in range(EP_NUM):
        state = env.reset()
        agent.reset()
        score = 0
        for t in range(200):
            action = agent.act(state)
            ca1 = model_1(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
            ca2 = model_2(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
            control_action = action[0] * ca1 + action[1] * ca2
            next_state, _, done = env.step(control_action, smoothness=0.5)
            # reward: stay alive, penalize fuel use and distance from the origin
            reward = 5
            reward -= weight * abs(control_action) * 20
            reward -= 1 / weight * (abs(next_state[0]) + abs(next_state[1]))
            if done and t < 95:
                reward -= 100
            agent.step(state, action, reward, next_state, done, t)
            score += reward
            state = next_state
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        score_average = np.mean(scores_deque)
        print('\rEpisode {}, Average Score: {:.2f}, Current Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, '
              'Epsilon: {:.2f}, Memory: {:.1f}'.format(
                  ep, score_average, scores[-1], np.max(scores), np.min(scores),
                  agent.epsilon, len(agent.memory)), end="\n")
        if ep > 0 and ep % 100 == 0:
            torch.save(agent.actor_local.state_dict(),
                       './adapter_soft/adapter_' + str(ep) + '_' + str(weight) + '.pth')
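# Minimal setup sketch, assuming the Agent class imported from Agent.py follows the usual
# (state_size, action_size, random_seed) DDPG interface. train_weight_adapter_DDPG() relies on
# the module-level globals `agent` and `weight`, which are defined elsewhere in the repo; the
# constructor arguments and the value of `weight` below are assumptions, not the original code.
#
#   agent = Agent(state_size=2, action_size=2, random_seed=0)   # assumed signature
#   weight = 0.1                                                 # fuel vs. tracking trade-off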
def collect_data(adapter_name, INDI_NAME):
    """Roll out a trained weight adapter and record [x1, x2, u] samples."""
    assert EXP1
    env = Osillator()
    model = Weight_adapter(2, 2).to(device)
    model.load_state_dict(torch.load(adapter_name))
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model(state).cpu().data.numpy()
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
                control_action = ca1 * action[0] + ca2 * action[1]
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                control_action.cpu().data.numpy()[0]
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
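# Usage sketch (file names are illustrative, not from the original). Each row collected above is
# [x1, x2, u]; saving the array with scipy.io (imported as `io`) keeps it readable by the MATLAB
# scripts in ./mat mentioned in the file header.
#
#   data = collect_data('./adapter_soft/adapter_2000_0.1.pth', INDI_NAME=None)
#   io.savemat('./mat/adapter_data.mat', {'data': data})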
def train_switcher_DDQN():
    """Train a double-DQN switcher that picks either model_1 or model_2 at every step."""
    mkdir('./adapter_ab')
    env = Osillator()
    model = DQN(2, 2).to(device)
    target_model = DQN(2, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)
    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()[0]
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()[0]
                else:
                    assert False
            next_state, _, done = env.step(control_action)
            # reward: stay alive, penalize fuel; heavy penalty for terminating early
            reward = 2
            reward -= weight * abs(control_action) * 20
            if done and t < 190:
                reward -= 100
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state, done)
            fuel_list.append(abs(control_action) * 20)
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size, optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:', np.mean(ep_reward),
              'fuel cost:', sum(fuel_list[-t - 1:]), 'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(model.state_dict(),
                       './adapter_ab/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
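# Minimal helper sketches, assuming the standard DDQN setup; the real `epsilon_by_frame`,
# `update_target`, `compute_td_loss`, `replay_buffer`, and `batch_size` used above are defined
# elsewhere in the repo. The decay constants and names below are assumptions for illustration.

import math


def epsilon_by_frame_sketch(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=500):
    # exponential decay from eps_start toward eps_final (decay constant is an assumption)
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)


def update_target_sketch(current_model, target_model):
    # hard copy of the online network's weights into the target network
    target_model.load_state_dict(current_model.state_dict())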
def train():
    """Train the PPO weight adapter over the two base controllers."""
    env = Osillator()
    state_dim = 2
    action_dim = 2
    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)
    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = gene_u(s, a, model_1, model_2)
            s_, _, done = env.step(u)
            # reward: stay alive, penalize fuel use and distance from the origin
            r = 10
            r -= WEIGHT * abs(np.clip(u, -1, 1)) * 20
            r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t < 95:
                r -= 100
            # store transition; since the nets are very small, reward normalization makes learning easier
            ppo.store_transition(s, a, r)
            s = s_
            ep_r += r
            # update PPO once the on-policy buffer is full
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
            ep + 1, EP_MAX, ep_r, time.time() - t0))
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
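# Minimal sketch of gene_u, which is defined elsewhere in the repo. This version is an assumption,
# inferred from how the other modes in this file mix the two controllers: the PPO action supplies
# the two mixing weights for model_1 and model_2.

def gene_u_sketch(state, action, model_1, model_2):
    s = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        ca1 = model_1(s).cpu().data.numpy()[0]
        ca2 = model_2(s).cpu().data.numpy()[0]
    # weighted combination of the two base control actions
    return action[0] * ca1 + action[1] * ca2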
def distill(adapter_name, INDI_NAME):
    """Distill the DDQN switcher plus the two base controllers into the single Individual network."""
    optimizer = torch.optim.SGD(Individual.parameters(), lr=0.001, momentum=0.9)
    loss_func = torch.nn.MSELoss()
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    model.load_state_dict(torch.load(adapter_name))
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model.act(state, epsilon=0)
            # the controller chosen by the switcher provides the regression target
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state)
                elif action == 1:
                    control_action = model_2(state)
            control_action.requires_grad = False
            prediction = Individual(state)
            loss = loss_func(prediction, control_action)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ep_loss += loss.item()
            next_state, reward, done = env.step(control_action.cpu().data.numpy()[0])
            state = next_state
            if done:
                break
        print(ep_loss)
    torch.save(Individual.state_dict(), INDI_NAME)
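# Usage sketch (checkpoint names are illustrative, following the save patterns above):
# train the switcher, distill it into the Individual network, then evaluate that network.
#
#   train_switcher_DDQN()
#   distill('./adapter_ab/ddqn_2000_0.1.pth', './individual.pth')
#   test(renew=True, mode='individual', INDI_NAME='./individual.pth')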
# This file trains and tests the NN controllers.
# The invariant of the Bernstein polynomial approximation is also shown here; its computation
# refers to the files in the ./mat folder, where the value-based method and polySOS are used.
import gym.spaces
import random
import torch
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
from interval import Interval
from env import Osillator
import scipy.io as io
from scipy.interpolate import interp2d

env = Osillator()

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Agent import Agent


def mkdir(path):
    """Create the directory if it does not already exist."""
    folder = os.path.exists(path)
    if not folder:
        os.makedirs(path)


def save_model(i_episode, score_average):
    print("Model Save...")
    if score_average > 300:
        # body not shown here; presumably checkpoints the agent once the average score is high enough
        pass
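# Minimal setup sketch, assuming definitions that live elsewhere in the repo. The functions in
# this file rely on several module-level objects that do not appear in this section: the torch
# `device`, the two pretrained controllers `model_1` and `model_2`, the distillation target
# `Individual`, the `DQN` and `Weight_adapter` network classes, a `replay_buffer`, and the reward
# weight `weight`/`WEIGHT`. A typical device setup would be:

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The controller checkpoints and the network class below are placeholders only:
#   model_1 = ControllerNet().to(device); model_1.load_state_dict(torch.load('<ckpt_1>.pth'))
#   model_2 = ControllerNet().to(device); model_2.load_state_dict(torch.load('<ckpt_2>.pth'))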
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    """Evaluate one control mode over EP_NUM initial states; saves the safe/unsafe starts under ./plot."""
    print(mode)
    env = Osillator()
    EP_NUM = 500
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
            elif mode == 'ppo':
                action = ppo.choose_action(state.cpu().data.numpy(), True)
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
                if ep == 0:
                    print(state, control_action)
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                if ATTACK:
                    delta, original = fgsm(Individual, state)
                    # delta = torch.from_numpy(np.random.uniform(low=-SCALE, high=SCALE, size=state.shape)).float().to(device)
                    control_action = Individual(state + delta).cpu().data.numpy()[0]
                else:
                    control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            control_action_list.append(control_action)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
    safe = np.array(safe)
    unsafe = np.array(unsafe)
    np.save('./plot/' + mode + '_safe.npy', safe)
    np.save('./plot/' + mode + '_unsafe.npy', unsafe)
    return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
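# Minimal sketch of the fgsm helper assumed by the 'individual' branch above; the real objective
# and the SCALE bound live elsewhere in the repo. This version is a generic one-step FGSM-style
# state perturbation whose loss target (the magnitude of the control output) is an assumption.

def fgsm_sketch(policy, state, scale=0.05):
    state = state.clone().detach().requires_grad_(True)
    original = policy(state)
    # assumed objective: push the magnitude of the control output
    loss = original.abs().sum()
    loss.backward()
    delta = scale * state.grad.sign()
    return delta.detach(), original.detach()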
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    """Variant of test() for the switch/d1/d2/individual modes; plots the first episode's trajectory."""
    print(mode)
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    if mode == 'switch':
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            # flag = where_inv(state.cpu().numpy())
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                if ep == 0:
                    print(t, state, action, control_action * 20)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
        else:
            print(ep, state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    return ep_reward, np.array(fuel_list), state_list
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    """Single-episode variant of test() (EP_NUM = 1); plots the resulting trajectory."""
    print(mode)
    env = Osillator()
    EP_NUM = 1
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            # attack happens here
            # state += np.random.uniform(low=-0.35, high=0.35, size=state.shape)
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
            elif mode == 'weight':
                action = model(state).cpu().data.numpy()
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]
            elif mode == 'individual':
                # delta, original = fgsm(Individual, state)
                # if ep == 0:
                #     print(delta, original)
                # control_action = Individual(state + delta).cpu().data.numpy()[0]
                control_action = Individual(state).cpu().data.numpy()[0]
            next_state, reward, done = env.step(control_action)
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            control_action_list.append(control_action)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    # safe = np.array(safe)
    # unsafe = np.array(unsafe)
    # plt.figure()
    # plt.scatter(safe[:, 0], safe[:, 1], c='green')
    # plt.scatter(unsafe[:, 0], unsafe[:, 1], c='red')
    # plt.savefig('./safe_sample_plot/' + mode + '.png')
    return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
if __name__ == '__main__':
    # if args.train:
    #     thread = threading.Thread(target=train)
    #     thread.daemon = True
    #     thread.start()
    #     if PLOT_RESULT:
    #         drawer = Drawer()
    #         drawer.plot()
    #         drawer.save()
    #     thread.join()
    train()
    assert False  # stop here; the PPO evaluation below is currently disabled

    # test
    env = Osillator()
    state_dim = 2
    action_dim = 2
    ppo = PPO(state_dim, action_dim, method=METHOD)
    ppo.load_model()
    mean_epoch_reward = 0
    for _ in range(TEST_EP):
        state = env.reset()
        for i in range(EP_LEN):
            if RENDER:
                env.render()
            action = ppo.choose_action(state, True)
            u = gene_u(state, action, model_1, model_2)
            next_state, reward, done = env.step(u)
            mean_epoch_reward += reward
            state = next_state
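    # Closing sketch (assumption about the intended output): mean_epoch_reward is only accumulated
    # above, so a natural final step would be to report the per-episode average, e.g.
    #   print('mean episode reward:', mean_epoch_reward / TEST_EP)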