def collect_data():
    # Roll out the PPO-weighted blend of the two pretrained controllers and
    # record (state, control action) pairs.
    env = ContinuousCartPoleEnv()
    EP_NUM = 1000
    data_set = []
    for ep in range(EP_NUM):
        state = env.reset()
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), True)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
                control_action = ca1 * action[0] + ca2 * action[1]
            control_action = np.clip(control_action.cpu().data.numpy(), -1, 1)
            next_state, reward, done, _ = env.step(control_action)
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                state.cpu().data.numpy()[2],
                state.cpu().data.numpy()[3],
                control_action[0]
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
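# --- Hedged sketch (not in the original listing) ---
# collect_data() returns an array whose first four columns are the cart-pole
# state and whose last column is the blended control action. One plausible use,
# assumed here, is to distill those pairs into the single `Individual` network
# that test() evaluates. The function name, optimizer, epoch count, and batch
# size below are illustrative assumptions, not the authors' code; it relies on
# the module-level torch, optim, device, and Individual used elsewhere here.
def distill_individual(data_set, epochs=50, batch_size=256, lr=1e-3):
    states = torch.from_numpy(data_set[:, :4]).float().to(device)
    actions = torch.from_numpy(data_set[:, 4:5]).float().to(device)
    optimizer = optim.Adam(Individual.parameters(), lr=lr)
    for _ in range(epochs):
        perm = torch.randperm(states.shape[0])
        for i in range(0, states.shape[0], batch_size):
            idx = perm[i:i + batch_size]
            # plain behavioral-cloning regression onto the recorded control actions
            loss = ((Individual(states[idx]) - actions[idx]) ** 2).mean()
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()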
def train_switcher_DDQN():
    # Train a DQN that switches between the two pretrained controllers
    # (action 0 -> model_1, action 1 -> model_2), trading stability against fuel cost.
    mkdir('./switch')
    env = ContinuousCartPoleEnv()
    model = DQN(4, 2).to(device)
    target_model = DQN(4, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)
    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()
                else:
                    assert False, 'unexpected switcher action: {}'.format(action)
            next_state, _, done, _ = env.step(control_action)
            # shaped reward: survival bonus minus fuel cost, with a penalty for early failure
            reward = 5
            reward -= weight * abs(control_action)
            if done and t != 199:
                reward -= 50
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state, done)
            fuel_list.append(abs(control_action))
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size, optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:', np.mean(ep_reward),
              'fuel cost:', sum(fuel_list[-t - 1:]), 'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(model.state_dict(), './switch/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
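# --- Hedged sketch (not in the original listing) ---
# train_switcher_DDQN() relies on three helpers that are not shown in this
# listing. The versions below are a minimal sketch of what they are assumed to
# do: an exponentially decaying epsilon schedule, a hard target-network update,
# and a double-DQN TD loss. The decay constants and the replay_buffer.sample
# signature are assumptions.
import math

def epsilon_by_frame(frame_idx, eps_start=1.0, eps_final=0.01, eps_decay=5000):
    # epsilon-greedy exploration rate, decaying with the number of frames seen
    return eps_final + (eps_start - eps_final) * math.exp(-frame_idx / eps_decay)

def update_target(model, target_model):
    # hard copy of the online network's weights into the target network
    target_model.load_state_dict(model.state_dict())

def compute_td_loss(model, target_model, batch_size, optimizer, gamma=0.99):
    # double DQN: the online network selects the next action,
    # the target network evaluates it
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)
    state = torch.FloatTensor(np.array(state)).to(device)
    next_state = torch.FloatTensor(np.array(next_state)).to(device)
    action = torch.LongTensor(action).to(device)
    reward = torch.FloatTensor(np.array(reward).reshape(-1)).to(device)
    done = torch.FloatTensor(np.array(done, dtype=np.float32)).to(device)

    q_value = model(state).gather(1, action.unsqueeze(1)).squeeze(1)
    next_action = model(next_state).argmax(dim=1, keepdim=True)
    next_q_value = target_model(next_state).gather(1, next_action).squeeze(1).detach()
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = (q_value - expected_q_value).pow(2).mean()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss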
def train():
    env = ContinuousCartPoleEnv()
    state_dim = 4
    action_dim = 2

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = np.clip(gene_u(s, a, model_1, model_2), -1, 1)
            s_, _, done, _ = env.step(u)
            # shaped reward: survival bonus minus fuel cost, with a penalty for early failure
            r = 5
            r -= WEIGHT * abs(u[0])
            # r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t != 199:
                r -= 50
            # useful for pendulum since the nets are very small; normalization makes it easier to learn
            ppo.store_transition(s, a, r)
            s = s_
            ep_r += r
            # update ppo once a full batch of transitions has been collected
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print('Episode: {}/{} | Episode Reward: {:.4f} | Running Time: {:.4f}'.format(
            ep + 1, EP_MAX, ep_r, time.time() - t0))
        # smoothed (exponential moving average) episode return for plotting
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
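# --- Hedged sketch (not in the original listing) ---
# `gene_u` is called by train() but not defined in this listing. It is assumed
# to mirror the weighted blend computed explicitly in collect_data() and in the
# 'ppo' branch of test(): the two-dimensional PPO action weights the outputs of
# the pretrained controllers model_1 and model_2.
def gene_u(state, action, model_1, model_2):
    state_t = torch.from_numpy(state).float().to(device)
    with torch.no_grad():
        ca1 = model_1(state_t).cpu().data.numpy()[0]
        ca2 = model_2(state_t).cpu().data.numpy()[0]
    # weighted blend of the two base control actions; the caller clips the result to [-1, 1]
    return np.array([action[0] * ca1 + action[1] * ca2])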
def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = Model(act_dim)
    algorithm = parl.algorithms.DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)
    rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)

    while rpm.size() < MEMORY_WARMUP_SIZE:
        run_train_episode(env, agent, rpm)

    episode = 0
    while episode < 30000:
        for i in range(50):
            train_reward = run_train_episode(env, agent, rpm)
            episode += 1
            # logger.info('Episode: {} Reward: {}'.format(episode, train_reward))
        evaluate_reward = run_evaluate_episode(env, agent, False)
        logger.info('Episode {}, Evaluate reward: {}'.format(episode, evaluate_reward))
        if evaluate_reward == 200:
            break
    agent.save('./model_dir')
def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)

    # pre-fill the replay memory before training
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    is_render = False
    while episode < TRAIN_EPISODE:
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
        # eval_reward = evaluate(env, agent, render=False)
        eval_reward = evaluate(env, agent, is_render)
        logger.info('episode:{} Test reward:{}'.format(episode, eval_reward))
def continuos_cartpole_train(n_episodes=2000, max_t=700):
    env = ContinuousCartPoleEnv()
    scores_deque = deque(maxlen=100)
    scores = []
    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)
    alg = DDPG(model, target_model, gamma=0.99, tau=1e-3, actor_lr=1e-4, critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        # agent.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            if done:
                break
        scores_deque.append(score)
        scores.append(score)
        # print('\rEpisode {}\tAverage Score: {:.2f}\tScore: {:.2f}'.format(i_episode, np.mean(scores_deque), score), end="")
        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        if np.mean(scores_deque) > 200:
            torch.save(agent.alg.model.actor_model.state_dict(), 'cart_pole_actor.pth')
            torch.save(agent.alg.model.critic_model.state_dict(), 'cart_pole_critic.pth')
            break
    return scores
def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    # obs_dim += 1  # add 1 to obs dim for time step feature
    logger.info('observation_dim {}, action_dim {}'.format(obs_dim, act_dim))

    scaler = Scaler(obs_dim)
    model = Model(obs_dim, act_dim)
    alg = parl.algorithms.PPO(
        model, act_dim=act_dim, policy_lr=model.policy_lr, value_lr=model.value_lr)
    agent = Agent(alg, obs_dim, act_dim, args.kl_targ, loss_type=args.loss_type)

    # run a few episodes to initialize scaler
    collect_trajectories(env, agent, scaler, episodes=5)

    test_flag = 0
    total_steps = 0
    while total_steps < args.train_total_steps:
        trajectories = collect_trajectories(
            env, agent, scaler, episodes=args.episodes_per_batch)
        total_steps += sum([t['obs'].shape[0] for t in trajectories])
        total_train_rewards = sum([np.sum(t['rewards']) for t in trajectories])

        train_obs, train_actions, train_advantages, train_discount_sum_rewards = build_train_data(
            trajectories, agent)

        policy_loss, kl = agent.policy_learn(train_obs, train_actions, train_advantages)
        value_loss = agent.value_learn(train_obs, train_discount_sum_rewards)

        logger.info(
            'Steps {}, Train reward: {}, Policy loss: {}, KL: {}, Value loss: {}'.format(
                total_steps, total_train_rewards / args.episodes_per_batch,
                policy_loss, kl, value_loss))

        if total_steps // args.test_every_steps >= test_flag:
            while total_steps // args.test_every_steps >= test_flag:
                test_flag += 1
            eval_reward = run_evaluate_episode(env, agent, scaler, render=True)
            logger.info('Steps {}, Evaluate reward: {}'.format(total_steps, eval_reward))
def test():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    model = Model(act_dim)
    algorithm = parl.algorithms.DDPG(
        model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    if os.path.exists('./model_dir'):
        agent.restore('./model_dir')
    eval_reward = run_evaluate_episode(env, agent, True)
    logger.info('test_reward:{}'.format(eval_reward))
def main():
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # build the agent with the PARL framework
    model = Model(act_dim)
    algorithm = DDPG(model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(algorithm, obs_dim, act_dim)

    # if a saved model exists, load it, evaluate it, and exit
    if os.path.exists('./model.ckpt'):
        agent.restore('./model.ckpt')
        eval_reward = evaluate(env, agent, render=True)
        print("eval_reward=", eval_reward)
        exit()

    # create the replay memory
    rpm = ReplayMemory(MEMORY_SIZE)

    # pre-fill the replay memory before training
    while len(rpm) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, rpm)

    episode = 0
    while episode < TRAIN_EPISODE:
        print("start training, episode=", episode)
        for i in range(50):
            total_reward = run_episode(agent, env, rpm)
            episode += 1
            print("episode=", episode, "total_reward=", total_reward)
        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{} Test reward:{}'.format(episode, eval_reward))
        agent.save('./model.ckpt')
def evaluate(render=True):
    env = ContinuousCartPoleEnv()
    obs_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    model = Model(state_size=obs_size, action_size=action_size)
    target_model = Model(state_size=obs_size, action_size=action_size)
    alg = DDPG(model, target_model, gamma=0.99, tau=1e-3, actor_lr=1e-4, critic_lr=3e-4)
    agent = Agent(alg, BUFFER_SIZE, BATCH_SIZE, seed=10)
    agent.alg.model.actor_model.load_state_dict(torch.load("cart_pole_actor.pth"))
    agent.alg.model.critic_model.load_state_dict(torch.load("cart_pole_critic.pth"))

    eval_reward = []
    for i in range(10):
        obs = env.reset()
        total_reward = 0
        steps = 0
        while True:
            action = agent.act(obs)
            steps += 1
            next_obs, reward, done, info = env.step(action)
            obs = next_obs
            total_reward += reward
            if render:
                env.render()
            if done:
                break
        eval_reward.append(total_reward)
    return np.mean(eval_reward)
import os
import sys

import torch

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from Agent import Agent

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("Device: ", device)

random_seed = int(sys.argv[2])
print(random_seed)

# env = gym.make('MountainCarContinuous-v0')
env = ContinuousCartPoleEnv()
env.seed(random_seed)

# size of each action
action_size = env.action_space.shape[0]
print('Size of each action:', action_size)

# examine the state space
state_size = env.observation_space.shape[0]
print('Size of state:', state_size)

action_low = env.action_space.low
print('Action low:', action_low)

action_high = env.action_space.high
print('Action high: ', action_high)
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
    print(mode)
    env = ContinuousCartPoleEnv()
    EP_NUM = 500
    if mode == 'switch':
        model = DQN(4, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state=state_list[ep], set_state=True)
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                # DQN switcher picks which pretrained controller to apply
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()
                    else:
                        assert False, 'unexpected switcher action: {}'.format(action)
            elif mode == 'ppo':
                # PPO outputs weights that blend the two pretrained controllers
                action = ppo.choose_action(state.cpu().data.numpy(), True)
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = np.array([action[0] * ca1 + action[1] * ca2])
                control_action = np.clip(control_action, -1, 1)
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()
            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()
            elif mode == 'individual':
                if ATTACK:
                    if t % 15 == 0:
                        # refresh the adversarial observation perturbation every 15 steps
                        delta = fgsm(Individual, state)
                        # ele1 = np.random.uniform(low=-SCALE1, high=SCALE1, size=1)[0]
                        # ele2 = np.random.uniform(low=-SCALE2, high=SCALE2, size=1)[0]
                        # delta = torch.from_numpy(np.array([ele1, 0, ele2, 0])).float().to(device)
                    control_action = Individual(state + delta).cpu().data.numpy()
                else:
                    control_action = Individual(state).cpu().data.numpy()
                control_action = np.clip(control_action, -1, 1)
            next_state, reward, done, _ = env.step(control_action)
            fuel += abs(control_action)
            state = next_state
            if ep == 99:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break
        ep_reward.append(ep_r)
        if t == 199:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
    safe = np.array(safe)
    unsafe = np.array(unsafe)
    np.save('./plot/' + mode + '_safe.npy', safe)
    np.save('./plot/' + mode + '_unsafe.npy', unsafe)
    return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
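# --- Hedged sketch (not in the original listing) ---
# `fgsm` is called in the ATTACK branch of test() but not defined here. The
# sketch below is a generic fast-gradient-sign perturbation of the observation:
# one signed gradient step that pushes the controller's output as far as
# possible in one direction. The loss choice and the step size `eps` are
# assumptions, not the authors' attack.
def fgsm(controller, state, eps=0.1):
    state_adv = state.clone().detach().requires_grad_(True)
    # gradient of the (scalar) control output with respect to the observation
    loss = controller(state_adv).sum()
    loss.backward()
    # single signed-gradient step on the observation
    return eps * state_adv.grad.sign()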
if __name__ == '__main__':
    # if args.train:
    #     thread = threading.Thread(target=train)
    #     thread.daemon = True
    #     thread.start()
    #     if PLOT_RESULT:
    #         drawer = Drawer()
    #         drawer.plot()
    #         drawer.save()
    #     thread.join()
    train()
    assert False  # stop here; the test code below is only run manually

    # test
    env = ContinuousCartPoleEnv()
    state_dim = 4  # must match the dimensions used in train()
    action_dim = 2
    ppo = PPO(state_dim, action_dim, method=METHOD)
    ppo.load_model()
    mean_epoch_reward = 0
    for _ in range(TEST_EP):
        state = env.reset()
        for i in range(EP_LEN):
            if RENDER:
                env.render()
            action = ppo.choose_action(state, True)
            u = np.clip(gene_u(state, action, model_1, model_2), -1, 1)
            next_state, reward, done, _ = env.step(u)
            mean_epoch_reward += reward
            state = next_state
            if done:
                break
    print('mean epoch reward:', mean_epoch_reward / TEST_EP)