def main():
    """Run the Kuka diverse-object grasping demo with an on-screen render.

    Creates a continuous-action KukaDiverseObjectEnv, samples actions from a
    ContinuousDownwardBiasPolicy, and loops over episodes forever, printing
    observations, actions, and the per-episode reward.

    NOTE(review): a second, near-identical ``main`` is defined immediately
    after this one in the file and will shadow this definition — confirm
    which one is intended and remove the other.
    """
    env = KukaDiverseObjectEnv(renders=True, isDiscrete=False)
    policy = ContinuousDownwardBiasPolicy()
    while True:  # run episodes until the process is interrupted
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render(mode='human')
            # epsilon=.1 is the policy's exploration parameter
            act = policy.sample_action(obs, .1)
            print("Action")
            print(act)
            # FIX: the sampled action was printed but env.step was called with
            # a hard-coded zero action [0, 0, 0, 0, 0], so the policy had no
            # effect; step with the sampled action instead.
            obs, rew, done, _ = env.step(act)
            episode_rew += rew
        print("Episode reward", episode_rew)
def main():
    """Run the Kuka diverse-object grasping demo (default render mode).

    Same loop as the preceding ``main`` except ``env.render()`` is called
    without an explicit mode. Samples actions from a
    ContinuousDownwardBiasPolicy and prints per-episode rewards forever.

    NOTE(review): this re-definition shadows the earlier ``main`` in the same
    file — confirm which variant is intended and delete the other.
    """
    env = KukaDiverseObjectEnv(renders=True, isDiscrete=False)
    policy = ContinuousDownwardBiasPolicy()
    while True:  # run episodes until the process is interrupted
        obs, done = env.reset(), False
        print("===================================")
        print("obs")
        print(obs)
        episode_rew = 0
        while not done:
            env.render()
            # epsilon=.1 is the policy's exploration parameter
            act = policy.sample_action(obs, .1)
            print("Action")
            print(act)
            # FIX: the sampled action was printed but env.step was called with
            # a hard-coded zero action [0, 0, 0, 0, 0], so the policy had no
            # effect; step with the sampled action instead.
            obs, rew, done, _ = env.step(act)
            episode_rew += rew
        print("Episode reward", episode_rew)
# Main training loop (fragment — the while-loop body continues past this chunk).
# Depends on names defined elsewhere in the file: env, agent, MAX_EPISODES,
# RAND_EPS, start_episode, np.
ep_reward_list = []          # reward of every completed episode
avg_reward_list = []         # running/averaged rewards (filled later, presumably)
actor_loss, critic_loss = 0, 0
best_score = -np.inf         # best episodic score seen so far
print('Main training loop')
for episode in range(start_episode, MAX_EPISODES):
    obsv = env.reset()
    # assumes observations are uint8 images in [0, 255] — TODO confirm
    state = np.asarray(
        obsv, dtype=np.float32) / 255.0  # convert into float array
    episodic_reward = 0
    frames = []              # rendered frames, captured only for the last episodes
    steps = 0
    while True:
        # record video frames only for the final two episodes
        if episode > MAX_EPISODES - 3:
            frames.append(env.render(mode='rgb_array'))
        # take an action as per the policy
        if episode < RAND_EPS:  # explore for some episodes
            action = env.action_space.sample()
        else:
            action = agent.policy(state)
        # obtain next state and rewards
        next_obsv, reward, done, info = env.step(action)
        next_state = np.asarray(
            next_obsv, dtype=np.float32) / 255.0  # convert into float array
        #tb_img = np.reshape(next_state, (-1, 48, 48, 3))  # for tensorboard