def worker(id, sac_trainer, ENV, rewards_queue, replay_buffer, max_episodes, max_steps, batch_size, explore_steps, \
           update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC, hidden_dim, model_path):
    '''
    the function for sampling with multi-processing
    '''
    with torch.cuda.device(id % torch.cuda.device_count()):  # spread workers across the available GPUs
        sac_trainer.to_cuda()
        print(sac_trainer, replay_buffer)  # the sac_trainer instances are not the same object, but all networks and optimizers inside them are shared; the replay buffer is the same one.

        if ENV == 'Reacher':
            NUM_JOINTS = 2
            LINK_LENGTH = [200, 140]
            INI_JOING_ANGLES = [0.1, 0.1]
            SCREEN_SIZE = 1000
            SPARSE_REWARD = False
            SCREEN_SHOT = False
            action_range = 10.0
            env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                          ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)
            action_dim = env.num_actions
            state_dim = env.num_observations
        elif ENV == 'Pendulum':
            env = NormalizedActions(gym.make("Pendulum-v0"))
            action_dim = env.action_space.shape[0]
            state_dim = env.observation_space.shape[0]
            action_range = 1.

        frame_idx = 0
        rewards = []

        # training loop
        for eps in range(max_episodes):
            episode_reward = 0
            if ENV == 'Reacher':
                state = env.reset(SCREEN_SHOT)
            elif ENV == 'Pendulum':
                state = env.reset()

            for step in range(max_steps):
                # random exploration for the first explore_steps frames, then the learned policy
                if frame_idx > explore_steps:
                    action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
                else:
                    action = sac_trainer.policy_net.sample_action()

                try:
                    if ENV == 'Reacher':
                        next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                    elif ENV == 'Pendulum':
                        next_state, reward, done, _ = env.step(action)
                        env.render()
                except KeyboardInterrupt:
                    print('Finished')
                    sac_trainer.save_model(model_path)

                replay_buffer.push(state, action, reward, next_state, done)

                state = next_state
                episode_reward += reward
                frame_idx += 1

                # if len(replay_buffer) > batch_size:
                if replay_buffer.get_length() > batch_size:
                    for i in range(update_itr):
                        _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                               target_entropy=-1. * action_dim)

                if eps % 10 == 0 and eps > 0:
                    # plot(rewards, id)
                    sac_trainer.save_model(model_path)

                if done:
                    break

            print('Worker: ', id, '| Episode: ', eps, '| Episode Reward: ', episode_reward)
            # if len(rewards) == 0:
            #     rewards.append(episode_reward)
            # else:
            #     rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)
            rewards_queue.put(episode_reward)

        sac_trainer.save_model(model_path)
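# A minimal launch sketch, assuming the shared sac_trainer and replay_buffer are built
# elsewhere (e.g. the trainer's network parameters placed in shared memory before forking,
# so that "all networks and optimizers are the same" as noted above). launch_workers and
# num_workers are placeholders, not part of the original source; only worker() and its
# argument list come from the code above.
import torch.multiprocessing as mp

def launch_workers(num_workers, sac_trainer, ENV, replay_buffer, max_episodes, max_steps,
                   batch_size, explore_steps, update_itr, action_itr, AUTO_ENTROPY,
                   DETERMINISTIC, hidden_dim, model_path):
    rewards_queue = mp.Queue()  # each worker pushes its episode return here
    processes = []
    for i in range(num_workers):
        p = mp.Process(target=worker,
                       args=(i, sac_trainer, ENV, rewards_queue, replay_buffer,
                             max_episodes, max_steps, batch_size, explore_steps,
                             update_itr, action_itr, AUTO_ENTROPY, DETERMINISTIC,
                             hidden_dim, model_path))
        p.daemon = True  # sampler processes die with the main process
        p.start()
        processes.append(p)
    return rewards_queue, processes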
# body of one QT-Opt training episode (this fragment runs inside an outer loop over i_episode)
if ENV == 'Reacher':
    state = env.reset(SCREEN_SHOT)
elif ENV == 'Pendulum':
    state = env.reset()
episode_reward = 0

for step in range(max_steps):
    # action = qt_opt.policy.act(state)
    action = qt_opt.cem_optimal_action(state)  # pick the action by CEM search over the Q-function
    if ENV == 'Reacher':
        next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
    elif ENV == 'Pendulum':
        next_state, reward, done, _ = env.step(action)
        env.render()
    episode_reward += reward
    replay_buffer.push(state, action, reward, next_state, done)
    state = next_state

if len(replay_buffer) > batch_size:
    qt_opt.update(batch_size)
    qt_opt.save_model(model_path)

episode_rewards.append(episode_reward)

if i_episode % 10 == 0:
    plot(episode_rewards)
print('Episode: {} | Reward: {}'.format(i_episode, episode_reward))
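# Illustrative sketch of the cross-entropy method (CEM) action search that a method like
# qt_opt.cem_optimal_action is typically built on: sample candidate actions from a Gaussian,
# score them with the Q-network, and refit the Gaussian to the top-scoring candidates.
# Function name, iteration counts, and the qnet(state, action) interface are assumptions
# for illustration, not the repository's actual implementation.
import numpy as np

def cem_action_search(qnet, state, action_dim, n_iter=5, n_samples=64, n_elite=6):
    mean = np.zeros(action_dim)
    std = np.ones(action_dim)
    for _ in range(n_iter):
        candidates = np.random.randn(n_samples, action_dim) * std + mean   # sample candidate actions
        scores = np.array([qnet(state, a) for a in candidates])            # Q-value of each candidate
        elite = candidates[np.argsort(scores)[-n_elite:]]                  # keep the best candidates
        mean, std = elite.mean(axis=0), elite.std(axis=0) + 1e-6           # refit the Gaussian
    return mean  # approximate argmax_a Q(state, a)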
def worker(id):  # threads can read the global variables directly
    '''
    the function for sampling with multi-threading
    '''
    print(sac_trainer, replay_buffer)
    if ENV == 'Reacher':
        env = Reacher(screen_size=SCREEN_SIZE, num_joints=NUM_JOINTS, link_lengths=LINK_LENGTH,
                      ini_joint_angles=INI_JOING_ANGLES, target_pos=[369, 430], render=True, change_goal=False)
    elif ENV == 'Pendulum':
        env = NormalizedActions(gym.make("Pendulum-v0"))
    print(env)

    frame_idx = 0
    rewards = []

    # training loop
    for eps in range(max_episodes):
        episode_reward = 0
        if ENV == 'Reacher':
            state = env.reset(SCREEN_SHOT)
        elif ENV == 'Pendulum':
            state = env.reset()

        for step in range(max_steps):
            # random exploration for the first explore_steps frames, then the learned policy
            if frame_idx > explore_steps:
                action = sac_trainer.policy_net.get_action(state, deterministic=DETERMINISTIC)
            else:
                action = sac_trainer.policy_net.sample_action()

            try:
                if ENV == 'Reacher':
                    next_state, reward, done, _ = env.step(action, SPARSE_REWARD, SCREEN_SHOT)
                elif ENV == 'Pendulum':
                    next_state, reward, done, _ = env.step(action)
                    env.render()
            except KeyboardInterrupt:
                print('Finished')
                sac_trainer.save_model(model_path)

            replay_buffer.push(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward
            frame_idx += 1

            if len(replay_buffer) > batch_size:
                for i in range(update_itr):
                    _ = sac_trainer.update(batch_size, reward_scale=10., auto_entropy=AUTO_ENTROPY,
                                           target_entropy=-1. * action_dim)

            if eps % 10 == 0 and eps > 0:
                plot(rewards, id)
                sac_trainer.save_model(model_path)

            if done:
                break

        print('Episode: ', eps, '| Episode Reward: ', episode_reward)
        # if len(rewards) == 0: rewards.append(episode_reward)
        # else: rewards.append(rewards[-1] * 0.9 + episode_reward * 0.1)

    sac_trainer.save_model(model_path)
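# Minimal sketch of how the threaded samplers might be started; the worker reads the shared
# globals printed above (sac_trainer, replay_buffer, hyperparameters), so only the thread id
# is passed in. num_workers is a placeholder. Note that because of Python's GIL the threads
# interleave rather than run fully in parallel, which is the usual motivation for the
# multi-process variant shown earlier.
import threading

threads = []
for i in range(num_workers):
    t = threading.Thread(target=worker, args=(i,))
    t.daemon = True  # sampling threads stop when the main program exits
    t.start()
    threads.append(t)
for t in threads:
    t.join()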