def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], []
    for i in range(num_agent):
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
                         output_size=action_dim,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)  # fixed: the target for qf2 must copy qf2, not qf1
        eval_policy = ArgmaxDiscretePolicy(policy)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        policy_n.append(policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MASACDiscreteTrainer(env=expl_env,
                                   qf1_n=qf1_n, target_qf1_n=target_qf1_n,
                                   qf2_n=qf2_n, target_qf2_n=target_qf2_n,
                                   policy_n=policy_n,
                                   **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
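# Illustrative only: a possible `variant` dict for launching the experiment above.
# The concrete values (hidden sizes, discount, epoch counts, batch size) are
# assumptions for this sketch, not values taken from the original configuration.
example_variant = dict(
    num_agent=2,
    replay_buffer_size=int(1e6),
    policy_kwargs=dict(hidden_sizes=[64, 64]),
    qf_kwargs=dict(hidden_sizes=[64, 64]),
    trainer_kwargs=dict(discount=0.99),
    algorithm_kwargs=dict(
        num_epochs=100,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        num_expl_steps_per_train_loop=1000,
        min_num_steps_before_training=1000,
        max_path_length=200,
        batch_size=256,
    ),
)
# experiment(example_variant)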
def step(self, action):
    self.state = self.state + np.asarray(action)
    env = CartPoleEnv(self.state[0], self.state[1], self.state[2],
                      self.state[3], self.state[4])
    episode_count = len(self.action_record)
    model_diff = 0
    for i in range(episode_count):
        ob = env.reset()
        traj_state = []
        for j in range(len(self.action_record[i])):
            # Replay the recorded actions; a trajectory that terminates early is
            # treated as a better match here, which is slightly tricky.
            action = self.action_record[i][j]
            ob, reward, done, _ = env.step(action)
            traj_state.append(ob)
            if done:
                break
        if not done:
            # penalty for a replayed trajectory that never terminates
            model_diff = model_diff + 1
        model_diff = model_diff + self._traj_diff(np.asarray(traj_state),
                                                  self.state_record[i])
    reward = -model_diff - self.status
    self.status = -model_diff
    done = False
    return np.array(self.state), reward, done, {}
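# A possible sketch of the _traj_diff() helper used in step() above; its real
# definition is not shown here.  Assumption: it scores how far the simulated
# trajectory drifts from the recorded one, e.g. the mean squared state error
# over the overlapping prefix.  Shown as a standalone function for clarity.
import numpy as np

def traj_diff(traj_state, recorded_state):
    n = min(len(traj_state), len(recorded_state))
    if n == 0:
        return 0.0
    return float(np.mean((np.asarray(traj_state)[:n] - np.asarray(recorded_state)[:n]) ** 2))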
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy_n, eval_policy_n, qf1_n, target_qf1_n, qf2_n, target_qf2_n = \
        [], [], [], [], [], []
    for i in range(num_agent):
        policy = TanhGaussianPolicy(obs_dim=obs_dim,
                                    action_dim=action_dim,
                                    **variant['policy_kwargs'])
        eval_policy = MakeDeterministic(policy)
        qf1 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        target_qf1 = copy.deepcopy(qf1)
        qf2 = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent),
                         output_size=1,
                         **variant['qf_kwargs'])
        target_qf2 = copy.deepcopy(qf2)  # fixed: was copied from qf1
        policy_n.append(policy)
        eval_policy_n.append(eval_policy)
        qf1_n.append(qf1)
        target_qf1_n.append(target_qf1)
        qf2_n.append(qf2)
        target_qf2_n.append(target_qf2)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MASACTrainer(env=expl_env,
                           qf1_n=qf1_n, target_qf1_n=target_qf1_n,
                           qf2_n=qf2_n, target_qf2_n=target_qf2_n,
                           policy_n=policy_n,
                           **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def looping(qt=None, epsilon=config.epsilon, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        end = False
        epsilon = epsilon * 0.9999
        while not end:
            current_state = cart.state
            action = choose_action(current_state, qt, epsilon)
            new_state, reward, end, _ = cart.step(action)
            if end:
                reward = -10
            update_qt_new(qt, current_state, reward, action, new_state)
            turn += 1
            if visu:
                cart.render()
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", epsilon)
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)") != "y"):
        #     break
    cart.close()
    return (data, qt)
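# Illustrative usage of looping() above.  `config` and the Q-table helpers are
# assumed to be defined elsewhere in this module; the pickle filename below is
# an assumption for the sketch, not part of the original project.
import pickle

history, q_table = looping(epsilon=1.0, visu=False)
with open("qtable.pkl", "wb") as f:
    pickle.dump(q_table, f)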
def __init__(self):
    self.plot_data = PlotData()
    self.env = CartPoleEnv()
    self.main_net = DQN()
    self.target_net = deepcopy(self.main_net)
    self.epsilon = config.epsilon
    self.eps_decay = 0.995
    self.visu = False
    self.visu_update = False  # 300
    self.visu_window = 5
    self.memory = Memory(memory_size=30)
    self.batch_size = 5
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    qf1 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf1 = copy.deepcopy(qf1)
    qf2 = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf2 = copy.deepcopy(qf2)
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    qf_criterion = nn.MSELoss()
    trainer = SACDiscreteTrainer(env=eval_env,
                                 policy=policy,
                                 qf1=qf1,
                                 qf2=qf2,
                                 target_qf1=target_qf1,
                                 target_qf2=target_qf2,
                                 qf_criterion=qf_criterion,
                                 **variant['trainer_kwargs'])
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    eval_env = MAProbDiscreteEnv(CartPoleEnv(mode=4))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf_n, policy_n, target_qf_n, target_policy_n, exploration_policy_n = \
        [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(input_size=(obs_dim * num_agent + action_dim * num_agent),
                        output_size=1,
                        **variant['qf_kwargs'])
        policy = SoftmaxMlpPolicy(input_size=obs_dim,
                                  output_size=action_dim,
                                  **variant['policy_kwargs'])
        target_qf = copy.deepcopy(qf)
        target_policy = copy.deepcopy(policy)
        exploration_policy = policy
        qf_n.append(qf)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_policy_n.append(target_policy)
        exploration_policy_n.append(exploration_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, exploration_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = MADDPGTrainer(qf_n=qf_n,
                            target_qf_n=target_qf_n,
                            policy_n=policy_n,
                            target_policy_n=target_policy_n,
                            **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    from rlkit.envs.wrappers import ProbDiscreteEnv
    expl_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    eval_env = ProbDiscreteEnv(CartPoleEnv(mode=2))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size
    # import gym
    # from rlkit.envs.wrappers import ProbDiscreteEnv
    # expl_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # eval_env = ProbDiscreteEnv(gym.make('CartPole-v0'))
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(input_size=obs_dim + action_dim,
                    output_size=1,
                    **variant['qf_kwargs'])
    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    # OU exploration is removed because the action must remain a probability vector:
    # exploration_policy = PolicyWrappedWithExplorationStrategy(
    #     exploration_strategy=OUStrategy(action_space=expl_env.action_space),
    #     policy=policy,
    # )
    exploration_policy = policy
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(qf=qf,
                          target_qf=target_qf,
                          policy=policy,
                          target_policy=target_policy,
                          **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def __init__(self):
    self.plot_data = PlotData()
    self.cart = CartPoleEnv()
    self.cart.reset()
    self.predi_net = DQN()
    self.updat_net = deepcopy(self.predi_net)
    self.turn = 0
    self.epidode = 0
    self.epsilon = config.epsilon
    self.eps_decay = 0.99
    self.visu = False
    self.visu_update = False  # 300
    self.visu_window = 5
    self.consecutive_wins = 0
    self.best_consecutive_wins = 0
    self.last_save = 0
    self.memory = []
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n
    # import gym
    # expl_env = gym.make('CartPole-v0')
    # eval_env = gym.make('CartPole-v0')
    # obs_dim = eval_env.observation_space.low.size
    # action_dim = eval_env.action_space.n

    policy = SoftmaxMlpPolicy(input_size=obs_dim,
                              output_size=action_dim,
                              **variant['policy_kwargs'])
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = VPGTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def resample_task():
    task_list = [
        CartPoleEnv(np.random.uniform(L_MIN, L_MAX)) for task in range(TASK_NUMS)
    ]
    task_lengths = [task.length for task in task_list]
    print("task length:", task_lengths)
    [task.reset() for task in task_list]
    return task_list
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=2)
    eval_env = CartPoleEnv(mode=2)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf = Mlp(input_size=obs_dim, output_size=action_dim, **variant['qf_kwargs'])
    target_qf = copy.deepcopy(qf)
    eval_policy = ArgmaxDiscretePolicy(qf)
    expl_policy = PolicyWrappedWithExplorationStrategy(
        EpsilonGreedy(expl_env.action_space, variant['epsilon']),
        eval_policy,
    )
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    replay_buffer = PrioritizedReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    qf_criterion = nn.MSELoss()
    trainer = DQNTrainer(qf=qf,
                         target_qf=target_qf,
                         qf_criterion=qf_criterion,
                         replay_buffer=replay_buffer,
                         **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=0))
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    qf = FlattenMlp(
        input_size=obs_dim + action_dim,
        output_size=1,
        **variant['qf_kwargs']
    )
    policy = TanhMlpPolicy(
        input_size=obs_dim,
        output_size=action_dim,
        **variant['policy_kwargs']
    )
    target_qf = copy.deepcopy(qf)
    target_policy = copy.deepcopy(policy)
    eval_path_collector = MdpPathCollector(eval_env, policy)
    exploration_policy = PolicyWrappedWithExplorationStrategy(
        exploration_strategy=OUStrategy(action_space=expl_env.action_space),
        policy=policy,
    )
    expl_path_collector = MdpPathCollector(expl_env, exploration_policy)
    replay_buffer = EnvReplayBuffer(variant['replay_buffer_size'], expl_env)
    trainer = DDPGTrainer(
        qf=qf,
        target_qf=target_qf,
        policy=policy,
        target_policy=target_policy,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=0)
    eval_env = CartPoleEnv(mode=0)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        **variant['policy_kwargs'],
    )
    vf = Mlp(
        hidden_sizes=[32, 32],
        input_size=obs_dim,
        output_size=1,
    )
    vf_criterion = nn.MSELoss()
    eval_policy = MakeDeterministic(policy)
    expl_policy = policy

    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        expl_policy,
    )
    trainer = PPOTrainer(policy=policy,
                         value_function=vf,
                         vf_criterion=vf_criterion,
                         **variant['trainer_kwargs'])
    algorithm = TorchOnlineRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
def loop(qt=None, epsilon=1, visu=False):
    plt.ion()
    cart = CartPoleEnv()
    data = []
    data_rm = []
    config.epsilon = epsilon
    if qt is None:
        qt = initialize_Qtable()
    for episode in range(config.episodes):
        cart.reset()
        turn = 0
        s = cart.state
        end = False
        epsilon_tmp = config.epsilon
        while not end:
            config.epsilon *= 0.97
            if visu:
                cart.render()
            a = choose_action(s, qt)
            _, _, end, _ = cart.step(a)
            l_val = bellman_q(s, qt, dummy_cart(s), action=a)
            # print(l_val)
            update_qt(qt, s, a, l_val)
            s = cart.state
            turn += 1
        data.append(turn)
        data_rm.append(np.mean(data[-100:]))
        print("Episode: ", episode, "\tTurn:", turn, "\t Epsilon:", config.epsilon)
        config.epsilon = epsilon_tmp
        if episode % config.graph_update == 0 and episode != 0:
            graph(data, data_rm)
        # if ((episode + 1) % 100 == 0 and input("continue (y/n)") != "y"):
        #     break
    cart.close()
    return (data, qt)
def main():
    # Define dimensions of the networks
    meta_value_input_dim = STATE_DIM + TASK_CONFIG_DIM  # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1  # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size=meta_value_input_dim, hidden_size=80, output_size=1)
    task_config_network = TaskConfigNetwork(input_size=task_config_input_dim, hidden_size=30, num_layers=1, output_size=3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    meta_value_network_optim = torch.optim.Adam(meta_value_network.parameters(), lr=0.001)
    task_config_network_optim = torch.optim.Adam(task_config_network.parameters(), lr=0.001)

    # init a task generator for data fetching
    task_list = [CartPoleEnv(np.random.uniform(L_MIN, L_MAX)) for task in range(TASK_NUMS)]
    [task.reset() for task in task_list]
    task_lengths = [task.length for task in task_list]
    print("task length:", task_lengths)

    for episode in range(EPISODE):
        # ----------------- Training ------------------
        if (episode + 1) % 10 == 0:
            # renew the tasks
            task_list = [CartPoleEnv(np.random.uniform(L_MIN, L_MAX)) for task in range(TASK_NUMS)]
            task_lengths = [task.length for task in task_list]
            print("task length:", task_lengths)
            [task.reset() for task in task_list]

        # fetch pre data samples for task config network
        # [task_nums, sample_nums, x+y]
        actor_network_list = [ActorNetwork(STATE_DIM, 40, ACTION_DIM) for i in range(TASK_NUMS)]
        [actor_network.cuda() for actor_network in actor_network_list]
        actor_network_optim_list = [torch.optim.Adam(actor_network.parameters(), lr=0.01)
                                    for actor_network in actor_network_list]

        # sample pre state, action, reward for task config
        pre_states = []
        pre_actions = []
        pre_rewards = []
        for i in range(TASK_NUMS):
            states, actions, rewards, _, _ = roll_out(actor_network_list[i], task_list[i], SAMPLE_NUMS)
            pre_states.append(states)
            pre_actions.append(actions)
            pre_rewards.append(rewards)

        for step in range(STEP):
            for i in range(TASK_NUMS):
                # init task config [1, sample_nums, task_config], task_config size = 3
                pre_data_samples = torch.cat((pre_states[i][-9:],
                                              pre_actions[i][-9:],
                                              torch.Tensor(pre_rewards[i])[-9:]), 1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda())  # [1,3]

                states, actions, rewards, is_done, final_state = roll_out(actor_network_list[i], task_list[i], SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(), task_config.detach()), 1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim_list[i].zero_grad()
                states_var = Variable(states).cuda()
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1, len(rewards)).view(-1, 3)
                log_softmax_actions = actor_network_list[i](states_var)
                vs = meta_value_network(torch.cat((states_var, task_configs.detach()), 1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r))).cuda()
                advantages = qs - vs
                actor_network_loss = -torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)  # + entropy  # + actor_criterion(actor_y_samples, target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network_list[i].parameters(), 0.5)
                actor_network_optim_list[i].step()

                # train value network
                meta_value_network_optim.zero_grad()
                target_values = qs
                values = meta_value_network(torch.cat((states_var, task_configs), 1))
                criterion = nn.MSELoss()
                meta_value_network_loss = criterion(values, target_values)
                meta_value_network_loss.backward()
                torch.nn.utils.clip_grad_norm(meta_value_network.parameters(), 0.5)
                meta_value_network_optim.step()

                # update the pre-samples used by the task config network
                pre_states[i] = states
                pre_actions[i] = actions
                pre_rewards[i] = rewards

                if (step + 1) % 100 == 0:
                    result = 0
                    test_task = CartPoleEnv(length=task_list[i].length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network_list[i](Variable(torch.Tensor([state])).cuda()))
                            # print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state, reward, done, _ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    print("episode:", episode, "task:", i, "step:", step + 1, "test result:", result / 10.0)

        if (episode + 1) % 10 == 0:
            # Save meta value network
            torch.save(meta_value_network.state_dict(), "meta_value_network_cartpole.pkl")
            torch.save(task_config_network.state_dict(), "task_config_network_cartpole.pkl")
            print("save networks for episode:", episode)
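# A minimal sketch of the discount_reward() helper called in main() above; its
# definition is not shown in this file.  Assumption: it returns the discounted
# return-to-go for every step of the rollout, bootstrapped with final_r at the end.
import numpy as np

def discount_reward(rewards, gamma, final_r):
    discounted = np.zeros(len(rewards))
    running = final_r
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted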
def dummy_cart(s, cart=None):
    if cart is None:
        cart = CartPoleEnv()
        cart.reset()
    cart.state = s
    return cart
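# Illustrative usage: dummy_cart() lets the Q-learning loop above evaluate a
# Bellman backup from an arbitrary state without disturbing the real cart.
# The state values and the chosen action below are made up for this example.
import numpy as np

probe_state = np.array([0.0, 0.1, 0.02, -0.1])  # [x, x_dot, theta, theta_dot]
sim = dummy_cart(probe_state)
next_state, reward, done, _ = sim.step(1)       # try action 1 from that state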
    # (fragment: tail of the DQN agent's epsilon-greedy act() method)
        self.epsilon *= self.decay
        if np.random.random() <= self.epsilon:
            return self.env.action_space.sample(), 10
        else:
            return np.argmax(self.model.predict(state)[0][0:2]), \
                np.argmax(self.model.predict(state)[0][2:5])

    def save_model(self, filename):
        self.model.save_weights(filename)

    def load_model(self, filename):
        self.model.load_weights(filename)


if __name__ == '__main__':
    env = CartPoleEnv()
    agent = DQN(24, 24, env)
    # agent.load_model('bot.h5')
    episode_data = []
    score_data = []
    episode_data_ = []
    score_data_ = []

    # Learning
    for episode in range(5000):
        state = env.reset()
        state = np.reshape(state, [1, 4])  # reshape to shape (1, 4) for the model
        for t in range(1000):
            action, force = agent.act(state)
def experiment(variant):
    from cartpole import CartPoleEnv
    expl_env = CartPoleEnv(mode=3)
    eval_env = CartPoleEnv(mode=3)
    num_agent = expl_env.num_agents
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    from rlkit.torch.networks.graph_builders import FullGraphBuilder
    graph_builder_obs = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    from rlkit.torch.networks.gnn_networks import GNNNet
    obs_gnn_1 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    graph_builder_eval = FullGraphBuilder(
        input_node_dim=graph_builder_obs.output_node_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    if variant['concat_emb']:
        gnn_out_dim = int(obs_dim + variant['graph_kwargs']['node_dim'] * variant['graph_kwargs']['num_conv_layers'])
    else:
        gnn_out_dim = variant['graph_kwargs']['node_dim']
    from rlkit.torch.networks.networks import FlattenMlp
    post_mlp1 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    from rlkit.torch.networks.graph_r2g_qnet2 import R2GQNet
    qf1 = R2GQNet(
        obs_gnn=obs_gnn_1,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp1,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf1 = copy.deepcopy(qf1)

    obs_gnn_2 = GNNNet(
        graph_builder_obs,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    post_mlp2 = FlattenMlp(
        input_size=gnn_out_dim,
        output_size=1,
        hidden_sizes=[variant['qf_kwargs']['hidden_dim']] * (variant['qf_kwargs']['num_layer'] - 1),
        hidden_activation=nn.LeakyReLU(negative_slope=0.2),
    )
    qf2 = R2GQNet(
        obs_gnn=obs_gnn_2,
        pre_graph_builder=graph_builder_eval,
        obs_dim=obs_dim,
        action_dim=action_dim,
        post_mlp=post_mlp2,
        normalize_emb=False,
        output_activation=None,
        concat_emb=variant['concat_emb'],
        **variant['graph_kwargs'],
    )
    target_qf2 = copy.deepcopy(qf2)

    graph_builder_ca = FullGraphBuilder(
        input_node_dim=obs_dim + action_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)
    cgca = GNNNet(
        graph_builder_ca,
        hidden_activation='lrelu0.2',
        output_activation='lrelu0.2',
        **variant['graph_kwargs'],
    )
    from rlkit.torch.networks.layers import SplitLayer
    from rlkit.torch.policies.tanh_gaussian_policy import TanhGaussianPolicy
    cactor = nn.Sequential(
        cgca,
        FlattenMlp(
            input_size=variant['graph_kwargs']['node_dim'],
            output_size=variant['cactor_kwargs']['hidden_dim'],
            hidden_sizes=[variant['cactor_kwargs']['hidden_dim']] * (variant['cactor_kwargs']['num_layer'] - 1),
            hidden_activation=nn.LeakyReLU(negative_slope=0.2),
            output_activation=nn.LeakyReLU(negative_slope=0.2),
        ),
        nn.LeakyReLU(negative_slope=0.2),
        SplitLayer(layers=[
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
            nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
        ]))
    cactor = TanhGaussianPolicy(module=cactor)

    graph_builder_policy = FullGraphBuilder(
        input_node_dim=obs_dim,
        num_node=num_agent,
        batch_size=variant['algorithm_kwargs']['batch_size'],
        contain_self_loop=False)

    policy_n, expl_policy_n, eval_policy_n = [], [], []
    for i in range(num_agent):
        policy = nn.Sequential(
            FlattenMlp(
                input_size=variant['graph_kwargs']['node_dim'],
                output_size=variant['policy_kwargs']['hidden_dim'],
                hidden_sizes=[variant['policy_kwargs']['hidden_dim']] * (variant['policy_kwargs']['num_layer'] - 1),
                hidden_activation=nn.LeakyReLU(negative_slope=0.2),
                output_activation=nn.LeakyReLU(negative_slope=0.2),
            ),
            SplitLayer(layers=[
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim),
                nn.Linear(variant['policy_kwargs']['hidden_dim'], action_dim)
            ]))
        policy = TanhGaussianPolicy(module=policy)
        from rlkit.torch.policies.make_deterministic import MakeDeterministic
        eval_policy = MakeDeterministic(policy)
        if variant['random_exploration']:
            from rlkit.exploration_strategies.base import PolicyWrappedWithExplorationStrategy
            from rlkit.exploration_strategies.epsilon_greedy import EpsilonGreedy
            expl_policy = PolicyWrappedWithExplorationStrategy(
                exploration_strategy=EpsilonGreedy(expl_env.action_space, prob_random_action=1.0),
                policy=policy,
            )
        else:
            expl_policy = policy
        policy_n.append(policy)
        expl_policy_n.append(expl_policy)
        eval_policy_n.append(eval_policy)

    from rlkit.samplers.data_collector.ma_path_collector import MAMdpPathCollector
    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n, shared_encoder=obs_gnn_1)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n, shared_encoder=obs_gnn_1)
    from rlkit.data_management.ma_env_replay_buffer import MAEnvReplayBuffer
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    from rlkit.torch.r2g.r2g_gnn12 import R2GGNNTrainer
    trainer = R2GGNNTrainer(env=expl_env,
                            qf1=qf1,
                            target_qf1=target_qf1,
                            qf2=qf2,
                            target_qf2=target_qf2,
                            cactor=cactor,
                            policy_n=policy_n,
                            **variant['trainer_kwargs'])
    from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)

    # save init params
    from rlkit.core import logger
    snapshot = algorithm._get_snapshot()
    file_name = osp.join(logger._snapshot_dir, 'itr_-1.pkl')
    torch.save(snapshot, file_name)

    algorithm.train()
            # (fragment: tail of the loop that counts discretized state transitions)
            # print('current_discrete', current_discrete)  # , next_discrete, action)
            discrete_states[action, current_discrete, next_discrete] += 1
            if terminal:
                state = env.reset(initial_state)
                episodes += 1
                step = 0


if __name__ == '__main__':
    args = parser().parse_args()
    no_intervals = args.no_intervals
    episodes = args.episodes
    ENV_NAME = "CartPole-v1"
    # env = gym.make(ENV_NAME)
    env = CartPoleEnv()
    action_space = env.action_space.n
    # env.seed(np.random.randint(0, 10000))
    no_discrete_states = no_intervals * no_intervals * no_intervals * no_intervals
    discrete_states = np.zeros((action_space, no_discrete_states, no_discrete_states), dtype='uint32')
    step = 2.3 / (no_intervals - 1)
    for x_bound in np.arange(-1.1, 1.2 + step, step):
        x = random.uniform(x_bound - step, x_bound)
        for v_bound in np.arange(-1.1, 1.2 + step, step):
            v = random.uniform(v_bound - step, v_bound)
            for a_bound in np.arange(-1.1, 1.2 + step, step):
                a = random.uniform(a_bound - step, a_bound)
                for v_a_bound in np.arange(-1.1, 1.2 + step, step):
                    for action in range(action_space):
from cartpole import CartPoleEnv
import numpy as np

cart = CartPoleEnv()
cart.reset()

for _ in range(1000):
    # Calculate the gradients
    # Update thetas
    # Sample a u trajectory
    # Apply u[0] to the actual system
    cart.step(10)  # apply some force
    # Update the new state in the learner
    # Shift the thetas
    # Simulate
    cart.render()

cart.close()
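# A minimal random-shooting planner sketch matching the commented-out steps in
# the loop above ("sample u trajectory, apply u[0]").  Assumptions: CartPoleEnv.step()
# accepts a scalar force (as cart.step(10) above suggests) and the env can be
# cloned with copy.deepcopy(); neither is confirmed by the original code.
import copy

def plan_random_shooting(env, horizon=20, n_candidates=64, force_scale=10.0):
    """Sample random force sequences, roll each out on a cloned env,
    and return the first force of the best-scoring sequence."""
    best_return, best_first_force = -np.inf, 0.0
    for _ in range(n_candidates):
        sim = copy.deepcopy(env)  # plan on a copy, not the real system
        forces = np.random.uniform(-force_scale, force_scale, size=horizon)
        total = 0.0
        for f in forces:
            _, r, done, _ = sim.step(f)
            total += r
            if done:
                break
        if total > best_return:
            best_return, best_first_force = total, forces[0]
    return best_first_force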
#!/usr/bin/env python
# coding: utf-8

# In[76]:

from cartpole import CartPoleEnv
import numpy as np
import random
import matplotlib.pyplot as plt

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=np.int)
#!/usr/bin/env python
# coding: utf-8

# In[2]:

from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=np.int)
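# Illustrative usage of discretize_state() above.  The bounds and bin counts
# below are made-up example values, not the ones used later in this script.
example_bounds = [(-2.4, 2.4), (-3.0, 3.0), (-0.21, 0.21), (-3.0, 3.0)]
example_bins = np.array([10, 10, 10, 10])
obs = env.reset()
print(discretize_state(obs, example_bounds, example_bins))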
from cartpole import CartPoleEnv

env = CartPoleEnv(length=1.0)
env.reset()
for step in range(1000):
    action = 0
    next_state, reward, done, _ = env.step(0)
    if done:
        print("done reward:", reward)
        break
from cartpole import CartPoleEnv
import gym
import numpy as np


def choose_action(state):
    action = 0
    if state[2] > 0:
        action = 0
    else:
        action = 1
    return action


if __name__ == "__main__":
    cart = CartPoleEnv()
    cart.reset()
    action = 0
    # while True:
    #     cart.render()
    #     state, reward, end, thing = cart.step(action)
    #     print(state)
    #     if end:
    #         cart.reset()
    #     else:
    #         action = choose_action(state)
    # cart.close()
from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(
            round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=np.int)


# position, velocity, angle, angular velocity
n_s = np.array([10, 10, 10, 10])
def experiment(variant):
    num_agent = variant['num_agent']
    from cartpole import CartPoleEnv
    from rlkit.envs.ma_wrappers import MAProbDiscreteEnv
    expl_env = CartPoleEnv(mode=4)
    eval_env = CartPoleEnv(mode=4)
    obs_dim = eval_env.observation_space.low.size
    action_dim = eval_env.action_space.n

    qf_n, cactor_n, policy_n, target_qf_n, target_cactor_n, target_policy_n, eval_policy_n, expl_policy_n = \
        [], [], [], [], [], [], [], []
    for i in range(num_agent):
        qf = FlattenMlp(
            input_size=(obs_dim * num_agent + action_dim * num_agent),
            output_size=1,
            **variant['qf_kwargs']
        )
        cactor = GumbelSoftmaxMlpPolicy(
            input_size=(obs_dim * num_agent + action_dim * (num_agent - 1)),
            output_size=action_dim,
            **variant['cactor_kwargs']
        )
        policy = GumbelSoftmaxMlpPolicy(
            input_size=obs_dim,
            output_size=action_dim,
            **variant['policy_kwargs']
        )
        target_qf = copy.deepcopy(qf)
        target_cactor = copy.deepcopy(cactor)
        target_policy = copy.deepcopy(policy)
        eval_policy = ArgmaxDiscretePolicy(policy, use_preactivation=True)
        expl_policy = PolicyWrappedWithExplorationStrategy(
            EpsilonGreedy(expl_env.action_space),
            eval_policy,
        )
        qf_n.append(qf)
        cactor_n.append(cactor)
        policy_n.append(policy)
        target_qf_n.append(target_qf)
        target_cactor_n.append(target_cactor)
        target_policy_n.append(target_policy)
        eval_policy_n.append(eval_policy)
        expl_policy_n.append(expl_policy)

    eval_path_collector = MAMdpPathCollector(eval_env, eval_policy_n)
    expl_path_collector = MAMdpPathCollector(expl_env, expl_policy_n)
    replay_buffer = MAEnvReplayBuffer(variant['replay_buffer_size'], expl_env, num_agent=num_agent)
    trainer = PRGTrainer(
        env=expl_env,
        qf_n=qf_n,
        target_qf_n=target_qf_n,
        policy_n=policy_n,
        target_policy_n=target_policy_n,
        cactor_n=cactor_n,
        target_cactor_n=target_cactor_n,
        **variant['trainer_kwargs']
    )
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        log_path_function=get_generic_ma_path_information,
        **variant['algorithm_kwargs']
    )
    algorithm.to(ptu.device)
    algorithm.train()
def experiment(variant):
    expl_env = NormalizedBoxEnv(CartPoleEnv(mode=1))
    eval_env = NormalizedBoxEnv(CartPoleEnv(mode=1))
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']
    vf1 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    vf2 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_vf1 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    target_vf2 = FlattenMlp(
        input_size=obs_dim,
        output_size=1,
        hidden_sizes=[M, M],
    )
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
        return_raw_action=True,
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
        store_raw_action=True,
    )
    trainer = FlowQTrainer(env=eval_env,
                           policy=policy,
                           vf1=vf1,
                           vf2=vf2,
                           target_vf1=target_vf1,
                           target_vf2=target_vf2,
                           **variant['trainer_kwargs'])
    algorithm = TorchBatchRLAlgorithm(
        trainer=trainer,
        exploration_env=expl_env,
        evaluation_env=eval_env,
        exploration_data_collector=expl_path_collector,
        evaluation_data_collector=eval_path_collector,
        replay_buffer=replay_buffer,
        **variant['algorithm_kwargs'])
    algorithm.to(ptu.device)
    algorithm.train()
# -*- coding: utf-8 -*-
"""Untitled0.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1lky0vjWP1y9GVXlg3VUukjP5nR9ry3SQ
"""

from cartpole import CartPoleEnv
import math
import numpy as np

env = CartPoleEnv()
env.reset()


def discretize(val, bounds, n_states):
    discrete_val = 0
    if val <= bounds[0]:
        discrete_val = 0
    elif val >= bounds[1]:
        discrete_val = n_states - 1
    else:
        discrete_val = int(round((n_states - 1) * ((val - bounds[0]) / (bounds[1] - bounds[0]))))
    return discrete_val


def discretize_state(vals, s_bounds, n_s):
    discrete_vals = []
    for i in range(len(n_s)):
        discrete_vals.append(discretize(vals[i], s_bounds[i], n_s[i]))
    return np.array(discrete_vals, dtype=np.int)
def main():
    # Define dimensions of the networks
    meta_value_input_dim = STATE_DIM + TASK_CONFIG_DIM  # 7
    task_config_input_dim = STATE_DIM + ACTION_DIM + 1  # 7

    # init meta value network with a task config network
    meta_value_network = MetaValueNetwork(input_size=meta_value_input_dim, hidden_size=80, output_size=1)
    task_config_network = TaskConfigNetwork(input_size=task_config_input_dim, hidden_size=30, num_layers=1, output_size=3)
    meta_value_network.cuda()
    task_config_network.cuda()

    if os.path.exists("meta_value_network_cartpole.pkl"):
        meta_value_network.load_state_dict(torch.load("meta_value_network_cartpole.pkl"))
        print("load meta value network success")
    if os.path.exists("task_config_network_cartpole.pkl"):
        task_config_network.load_state_dict(torch.load("task_config_network_cartpole.pkl"))
        print("load task config network success")

    task_lengths = np.linspace(L_MIN, L_MAX, TASK_NUMS)
    datas = []
    for task_length in task_lengths:
        data_i = {}
        data_i["task_length"] = task_length
        data_i_episode = {}
        for episode in range(EPISODE):
            task = CartPoleEnv(length=task_length)
            task.reset()
            data_i_episode["episode"] = episode

            # ----------------- Training ------------------
            # fetch pre data samples for task config network
            # [task_nums, sample_nums, x+y]
            actor_network = ActorNetwork(STATE_DIM, 40, ACTION_DIM)
            actor_network.cuda()
            actor_network_optim = torch.optim.Adam(actor_network.parameters(), lr=0.01)
            '''
            if os.path.exists("actor_network.pkl"):
                actor_network.load_state_dict(torch.load("actor_network.pkl"))
                print("load actor_network success")
            '''
            # sample pre state, action, reward for task config
            pre_states, pre_actions, pre_rewards, _, _ = roll_out(actor_network, task, SAMPLE_NUMS)

            test_results = []
            train_games = []
            for step in range(STEP):
                # init task config [1, sample_nums, task_config], task_config size = 3
                pre_data_samples = torch.cat((pre_states[-9:],
                                              pre_actions[-9:],
                                              torch.Tensor(pre_rewards)[-9:]), 1).unsqueeze(0)
                task_config = task_config_network(Variable(pre_data_samples).cuda())  # [1,3]

                states, actions, rewards, is_done, final_state = roll_out(actor_network, task, SAMPLE_NUMS)
                final_r = 0
                if not is_done:
                    value_inputs = torch.cat((Variable(final_state.unsqueeze(0)).cuda(), task_config.detach()), 1)
                    final_r = meta_value_network(value_inputs).cpu().data.numpy()[0]

                # train actor network
                actor_network_optim.zero_grad()
                states_var = Variable(states).cuda()
                actions_var = Variable(actions).cuda()
                task_configs = task_config.repeat(1, len(rewards)).view(-1, 3)
                log_softmax_actions = actor_network(states_var)
                vs = meta_value_network(torch.cat((states_var, task_configs.detach()), 1)).detach()
                # calculate qs
                qs = Variable(torch.Tensor(discount_reward(rewards, 0.99, final_r))).cuda()
                advantages = qs - vs
                actor_network_loss = -torch.mean(torch.sum(log_softmax_actions * actions_var, 1) * advantages)  # + entropy  # + actor_criterion(actor_y_samples, target_y)
                actor_network_loss.backward()
                torch.nn.utils.clip_grad_norm(actor_network.parameters(), 0.5)
                actor_network_optim.step()

                pre_states = states
                pre_actions = actions
                pre_rewards = rewards

                # testing
                if (step + 1) % 10 == 0:
                    result = 0
                    test_task = CartPoleEnv(length=task.length)
                    for test_epi in range(10):
                        state = test_task.reset()
                        for test_step in range(200):
                            softmax_action = torch.exp(actor_network(Variable(torch.Tensor([state])).cuda()))
                            # print(softmax_action.data)
                            action = np.argmax(softmax_action.cpu().data.numpy()[0])
                            next_state, reward, done, _ = test_task.step(action)
                            result += reward
                            state = next_state
                            if done:
                                break
                    aver_result = result / 10.0
                    test_results.append(aver_result)
                    train_games.append(task.episodes)
                    print("task length:", task_length, "episode:", episode, "step:", step + 1, "result:", aver_result)

            data_i_episode["test_results"] = test_results
            data_i_episode["train_games"] = train_games
        data_i["results"] = data_i_episode
        datas.append(data_i)
        save_to_json('mvn_cartpole_test_100.json', datas)