print("Number of episodes : " + str(args.num_episodes)) for i_episode in range(args.num_episodes): ''' Here, num_episodes correspond to the generations in Algo 1. In every generation, the population is evaluated, ranked, mutated, and re-instered into population ''' evo.evaluate_pop() evo.rank_pop_selection_mutation() print("Evolutionary Fitness = " + str(evo.best_policy.fitness)) ''' ############# The DDPG part ############# ''' state = torch.Tensor([env.reset()]) # algo line 6 ounoise.scale = (args.noise_scale - args.final_noise_scale) * max( 0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale ounoise.reset() episode_reward = 0 for t in range(args.num_steps): # line 7 # forward pass through the actor network action = agent.select_action(state, ounoise) # line 8 next_state, reward, done, _ = env.step(action.numpy()[0]) # line 9 episode_reward += reward action = torch.Tensor(action) mask = torch.Tensor([not done]) next_state = torch.Tensor([next_state])
agent.actor_target = nn.DataParallel(agent.actor_target)
agent.actor_perturbed = nn.DataParallel(agent.actor_perturbed)
agent.critic = nn.DataParallel(agent.critic)
agent.critic_target = nn.DataParallel(agent.critic_target)

agent.actor.to(device)
agent.actor_target.to(device)
agent.actor_perturbed.to(device)
agent.critic.to(device)
agent.critic_target.to(device)

end_str = "_{}_{}".format(args.env_name, args.model_suffix)
agent.load_model("models/ddpg_actor" + end_str, "models/ddpg_critic" + end_str)

while True:
    episode_reward = 0
    state = torch.Tensor([env.reset()]).to(device)
    env.render()
    while True:
        action = agent.select_action(state, None, None)
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        env.render()
        episode_reward += reward

        # action = torch.Tensor(action).to(device)
        mask = torch.Tensor([not done]).to(device)
        next_state = torch.Tensor([next_state]).to(device)
        reward = torch.Tensor([reward]).to(device)

        state = next_state
        print("Reward: {}; Episode reward: {}".format(reward, episode_reward))
        if done:
            # End the episode; the outer loop resets the env and starts over.
            break
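# --- Hedged note: nn.DataParallel prefixes state-dict keys with "module.",
# so checkpoints saved from unwrapped networks will not load into the wrapped
# ones above without remapping keys. A common workaround (assumed helper, not
# part of this repo):
def load_unwrapped_checkpoint(net, path):
    # Strip a leading "module." from every key so a plain checkpoint loads
    # into a DataParallel-wrapped network (the reverse works via net.module).
    state = torch.load(path, map_location=device)
    state = {k.replace("module.", "", 1): v for k, v in state.items()}
    target = net.module if hasattr(net, "module") else net
    target.load_state_dict(state)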
offset_time = args.offset_time  # 0
start_time = time.time()
Qevaluations = []

goal_path = cur_path + '/' + str(args.flow_family) + '/' + str(args.env_name) + '/' + 'seed_' + str(args.seed)
os.makedirs(goal_path)
print(start_time, args.env_name)

if 'dataframe' in args:
    df = args.dataframe
else:
    df = pd.DataFrame(columns=["total_steps", "score_eval", "time_so_far"])

for i_episode in itertools.count(start_episode):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if args.start_steps > total_numsteps:
            # Sample a random action uniformly from the action space.
            action = np.random.uniform(env.action_space.low,
                                       env.action_space.high,
                                       env.action_space.shape[0])
        else:
            action = agent.select_action(state)  # Sample action from policy

        if len(memory) > args.start_steps:
            # Number of updates per step in environment
            for i in range(args.updates_per_step):
                # Update parameters of all the networks
                (critic_1_loss, critic_2_loss, policy_loss, _, _,
                 policy_info) = agent.update_parameters(memory, args.batch_size, updates)
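# --- Hedged guess: given the columns declared above, evaluation results are
# presumably appended one row at a time, folding offset_time into the wall
# clock. Illustrative only, with an assumed eval_score variable:
#
#   df.loc[len(df)] = [total_numsteps, eval_score,
#                      time.time() - start_time + offset_time]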
env.observation_space.shape[0], env.action_space)
memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        # Linearly anneal the OU noise scale toward final_noise_scale.
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
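# --- Hedged sketch: AdaptiveParamNoiseSpec is assumed to follow the usual
# parameter-space-noise recipe (Plappert et al.): grow or shrink the
# perturbation stddev by adaptation_coefficient so the perturbed and
# unperturbed policies stay about desired_action_stddev apart in action
# space. A minimal stand-in:
class AdaptiveParamNoiseSketch:
    def __init__(self, initial_stddev=0.05, desired_action_stddev=0.2,
                 adaptation_coefficient=1.05):
        self.current_stddev = initial_stddev
        self.desired_action_stddev = desired_action_stddev
        self.adaptation_coefficient = adaptation_coefficient

    def adapt(self, distance):
        # distance: mean action-space gap between the perturbed and
        # unperturbed policies, measured on recent states.
        if distance > self.desired_action_stddev:
            self.current_stddev /= self.adaptation_coefficient  # too noisy: shrink
        else:
            self.current_stddev *= self.adaptation_coefficient  # too timid: grow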
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

writer = SummaryWriter()

# Memory
memory = ReplayMemory(args.replay_size)

# Training loop
rewards = []
total_numsteps = 0
updates = 0

for i_episode in itertools.count():
    state = env.reset()
    episode_reward = 0
    while True:
        action = agent.select_action(state)  # Sample action from policy
        next_state, reward, done, _ = env.step(action)  # Step
        mask = not done  # 1 for not done and 0 for done
        memory.push(state, action, reward, next_state, mask)  # Append transition to memory

        if len(memory) > args.batch_size:
            # Number of updates per step in environment
            for i in range(args.updates_per_step):
                # Sample a batch from memory
                state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(args.batch_size)
                # Update parameters of all the networks
                value_loss, critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                    state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates)
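# --- Hedged sketch (assumed internals of agent.update_parameters): the mask
# pushed above typically gates the bootstrap term of the soft Bellman target,
# so terminal transitions contribute only their immediate reward. Under that
# assumption, with twin target critics and entropy coefficient alpha:
import torch

def soft_bellman_target(reward, mask, q1_next, q2_next, log_pi_next,
                        gamma=0.99, alpha=0.2):
    # mask == 0 on terminal transitions kills the bootstrapped value;
    # torch.min over the twin critics counters overestimation (SAC).
    q_next = torch.min(q1_next, q2_next) - alpha * log_pi_next
    return reward + mask * gamma * q_next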
    a_noise.reset()
    if p_noise is not None:
        a.perturb_actor_parameters(p_noise)

total_steps = 0
print(base_dir)

if args.num_steps is not None:
    assert args.num_epochs is None
    nb_epochs = int(args.num_steps) // (args.num_epochs_cycles * args.num_rollout_steps)
else:
    nb_epochs = 500

state = agent.Tensor([env.reset()])
episode_reward = 0
agent.train()
reset_noise(agent, noise, param_noise)

if args.visualize:
    vis = visdom.Visdom(env=base_dir)
else:
    vis = None

train_steps = 0
episode_timesteps = 0

for epoch in trange(nb_epochs):
    for cycle in range(args.num_epochs_cycles):
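# --- Worked example with assumed numbers: num_steps = 1_000_000,
# num_epochs_cycles = 20 and num_rollout_steps = 100 give
# nb_epochs = 1_000_000 // (20 * 100) = 500, matching the default used when
# num_steps is not supplied.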
env_name = args.env
try:
    env = NormalizedActions(envs.env_list[env_name](render=args.render))
except TypeError as err:
    print('no render argument; assuming env.render will just work')
    env = NormalizedActions(envs.env_list[env_name]())

# Every action bound must lie within [-1, 1] for a normalized action space.
assert np.all(np.abs(env.action_space.low) <= 1.) and np.all(
    np.abs(env.action_space.high) <= 1.), 'Action space not normalized'

if args.record:
    env = gym.wrappers.Monitor(env, './data/vid/mpc/{}-{}'.format(
        env_name, args.frame), force=True)

# Seed before the first reset so the initial state is reproducible too.
env.seed(args.seed)
env.reset()
np.random.seed(args.seed)
random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

action_dim = env.action_space.shape[0]
state_dim = env.observation_space.shape[0]

device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda:0'
writer_test = SummaryWriter(log_dir='runs/' + folder + 'run_' + str(i_run) + '/test')

# Setup replay memory
memory = ReplayMemory(args.replay_size)

# TRAINING LOOP
total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
rewards = []

for i_episode in itertools.count(1):
    print(updates)
    ts = time.time()
    episode_reward = episode_steps = 0
    done = False
    state = env.reset()

    if cnn:
        state_buffer = StateBuffer(args.state_buffer_size, state)
        state = state_buffer.get_state()

    critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

    while not done:
        # if cnn:
        #     writer_train.add_images('episode_{}'.format(str(i_episode)),
        #                             state_buffer.get_tensor(), episode_steps)
        if i_episode < args.warm_up_episode:
            action = env.action_space.sample()  # Sample random action
        else:
            action = agent.select_action(state)  # Sample action from policy
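# --- Hedged sketch: StateBuffer is not shown here; for a CNN agent it is
# assumed to stack the last `size` observations along a leading channel axis
# so the network can see short-term motion. A minimal version with the
# get_state interface used above:
from collections import deque
import numpy as np

class StateBufferSketch:
    def __init__(self, size, init_state):
        # Pre-fill with copies of the first frame so get_state() always
        # returns a full stack.
        self.frames = deque([init_state] * size, maxlen=size)

    def append(self, state):
        self.frames.append(state)

    def get_state(self):
        # Stack the buffered frames along a new leading axis.
        return np.stack(self.frames, axis=0)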
if args.layers == 1:
    policy = SingleLayerPolicy(args.hidden_size, env.observation_space.shape[0], env.action_space)
elif args.layers == 2:
    policy = TwoLayerPolicy(args.hidden_size, env.observation_space.shape[0], env.action_space)

agent = LPO(args.hidden_size, env.observation_space.shape[0],
            env.action_space, args.constraint_size, policy)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

# change this.
# Sample 1 trajectory of n steps (n = num_steps arg).
# For each of the n steps, sample k trajectories (k: arg).
# Create constraints from these n*k rollouts.
for i_episode in range(args.num_rollouts):
    start_state = env.reset()
    state = torch.Tensor([start_state])

    entropies = []
    log_probs = []
    states = [start_state]
    states_bytes = [start_state.tobytes()]
    actions = []
    rewards = []

    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)
        actions.append(action)
        action = action.cpu()
        next_state, reward, done, _ = env.step(action.numpy()[0])
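# --- Hedged sketch of the rollout plan described in the comment above.
# Branching k rollouts from each of the n visited states needs a way to
# restore the environment to a saved state; restore_state below is a
# hypothetical helper (not a gym API) standing in for however this repo
# snapshots states.
def branch_rollouts(env, states, k, horizon, restore_state, policy):
    constraints = []
    for s0 in states:                 # the n states of the main trajectory
        for _ in range(k):            # k branched rollouts per state
            restore_state(env, s0)    # hypothetical state restore
            s, traj = s0, []
            for _ in range(horizon):
                a = policy(s)
                s, r, done, _ = env.step(a)
                traj.append((a, r))
                if done:
                    break
            constraints.append(traj)
    return constraints                # n * k constraint trajectories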
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
    param_noise = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
else:
    ounoise = None
    param_noise = None

rewards = []
total_numsteps = 0
updates = 0

device = torch.device('cuda')
num_steps = args.num_frames // args.num_processes
state = torch.Tensor(env.reset())
episode_rewards = torch.zeros(args.num_processes, 1).to(device)
final_rewards = torch.zeros(args.num_processes, 1).to(device)

start = time.time()
for step in range(int(num_steps)):
    '''
    if args.ou_noise and ounoise is not None:
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(
            0, args.exploration_end - i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()
    if args.param_noise and args.algo == "DDPG" and param_noise is not None:
        agent.perturb_actor_parameters(param_noise)
    '''
    episode_reward = 0
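# --- Hedged sketch: episode_rewards / final_rewards above follow the usual
# per-process bookkeeping for vectorized envs (assumed here; masks is 1.0
# while an episode runs and 0.0 on the step it ends):
def update_reward_trackers(episode_rewards, final_rewards, reward, masks):
    episode_rewards = episode_rewards + reward
    # Freeze the finished episodes' totals into final_rewards...
    final_rewards = final_rewards * masks + (1 - masks) * episode_rewards
    # ...and zero the running totals for envs that just reset.
    episode_rewards = episode_rewards * masks
    return episode_rewards, final_rewards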
    '/tmp/{}-experiment'.format(env_name), force=True)

env.seed(args.seed)
torch.manual_seed(args.seed)
np.random.seed(args.seed)

agent = REINFORCE(args.hidden_size, env.observation_space.shape[0], env.action_space)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):
    os.mkdir(dir)

for i_episode in range(args.num_episodes):
    env_reset = np.expand_dims(env.reset(), 0)
    state = torch.Tensor(env_reset)
    # print(state)
    entropies = []
    log_probs = []
    rewards = []

    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)
        action = action.cpu()
        next_state, reward, done, _ = env.step(action.numpy()[0])
        if args.render:
            env.render()
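# --- Hedged sketch: the entropies, log_probs and rewards gathered above are
# assumed to feed a standard REINFORCE update with an entropy bonus; gamma
# and beta below are assumed hyperparameters, not this repo's values.
def reinforce_loss(rewards, log_probs, entropies, gamma=0.99, beta=1e-3):
    returns, g = [], 0.0
    for r in reversed(rewards):           # discounted return-to-go
        g = r + gamma * g
        returns.insert(0, g)
    loss = 0.0
    for g, lp, h in zip(returns, log_probs, entropies):
        loss = loss - lp * g - beta * h   # policy gradient + entropy bonus
    return loss / len(rewards)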