# Shared dependencies assumed by the snippets in this section. `queue`,
# `fastenv`, `prRed`, `prBlack`, `ACE`, `writer`, and `args` are project
# helpers/globals defined elsewhere in the repository; the imports below are
# the standard-library and numpy modules the code actually uses.
import random
import signal
import time

import numpy as np


def __call__(self, env, policy, debug=False, visualize=False):
    episode_memory = queue()
    observation = None
    result = []
    for episode in range(self.validate_episodes):
        # reset at the start of episode
        episode_memory.clear()
        observation = env.reset()
        episode_memory.append(observation)
        observation = episode_memory.getObservation(
            self.window_length, observation, self.pic)
        episode_steps = 0
        episode_reward = 0.
        assert observation is not None
        # start episode; check max_episode_length for truthiness first so a
        # None/0 limit means "no limit" instead of raising a TypeError
        done = False
        while not done and (not self.max_episode_length
                            or episode_steps <= self.max_episode_length):
            action = policy(observation)
            observation, reward, done, info = env.step(action)
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                self.window_length, observation, self.pic)
            if visualize:
                if self.bullet:
                    import pybullet
                    pybullet.resetDebugVisualizerCamera(
                        cameraDistance=10, cameraYaw=0, cameraPitch=-6.6,
                        cameraTargetPosition=[10, 0, 0])
                env.render()
            episode_reward += reward
            episode_steps += 1
        result.append(episode_reward)
        if debug:
            prRed('[Evaluate] reward:{}'.format(result))
    return result
def __call__(self, env, policy, debug=False, visualize=False, window_length=1):
    episode_memory = queue()
    observation = None
    result = []
    for episode in range(self.num_episodes):
        # reset at the start of episode
        episode_memory.clear()
        observation = env.reset()
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        episode_steps = 0
        episode_reward = 0.
        assert observation is not None
        # start episode
        done = False
        while not done:
            action = policy(observation)
            observation, reward, done, info = env.step(action)
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            # truncate the episode once the step limit is reached
            if self.max_episode_length and episode_steps >= self.max_episode_length - 1:
                done = True
            if visualize:
                env.render()
            # update
            episode_reward += reward
            episode_steps += 1
        result.append(episode_reward)
    return np.mean(result)
def __call__(self, env, policy, debug=False, visualize=False):
    episode_memory = queue()
    observation = None
    result = []
    for episode in range(self.validate_episodes):
        # reset at the start of episode
        episode_memory.clear()
        observation = env.reset()
        episode_memory.append(observation)
        observation = episode_memory.getObservation(self.window_length, observation)
        episode_steps = 0
        episode_reward = 0.
        assert observation is not None
        # start episode; check max_episode_length for truthiness first so a
        # None/0 limit means "no limit" instead of raising a TypeError
        done = False
        while not done and (not self.max_episode_length
                            or episode_steps <= self.max_episode_length):
            action = policy(observation)
            observation, reward, done, info = env.step(action)
            episode_memory.append(observation)
            observation = episode_memory.getObservation(self.window_length, observation)
            if visualize:
                if self.bullet:
                    import pybullet
                    pybullet.resetDebugVisualizerCamera(
                        cameraDistance=10, cameraYaw=0, cameraPitch=-6.6,
                        cameraTargetPosition=[10, 0, 0])
                env.render()
            episode_reward += reward
            episode_steps += 1
        result.append(episode_reward)
        if debug:
            prRed('[Evaluate] reward:{}'.format(result))
    return result
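# Every snippet in this section stores raw observations in a `queue` object and
# calls `getObservation` to stack the last `window_length` frames into a single
# agent input. The class is defined elsewhere in the repository; what follows
# is a minimal sketch under that assumption (only the interface used above is
# given by the call sites; the padding and stacking details are illustrative,
# not the original implementation).
from collections import deque

import numpy as np


class queue:
    def __init__(self):
        self.buf = deque()

    def append(self, observation):
        self.buf.append(np.asarray(observation))

    def clear(self):
        self.buf.clear()

    def getObservation(self, window_length, observation, pic=False):
        # `observation` was already appended by the caller, so the window is
        # just the most recent `window_length` frames, left-padded with the
        # oldest available frame right after a reset.
        frames = list(self.buf)[-window_length:]
        while len(frames) < window_length:
            frames.insert(0, frames[0])
        if pic:
            # image mode: stack frames along the channel axis
            return np.concatenate(frames, axis=0)
        # vector mode: concatenate flattened state vectors
        return np.concatenate([f.ravel() for f in frames])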
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)
    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action: random during warmup, policy + noise afterwards
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(
            window_length, observation, args.pic)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done or (max_episode_length and episode_steps >= max_episode_length):
            # end of episode
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action,
                                               debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} '
                        'interval_time:{:.2f} train_time:{:.2f}'.format(
                            episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    if debug:
        prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis, args.atari)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    validate_episodes = args.validate_episodes
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)
    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action: random during warmup, policy + noise afterwards
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(
            window_length, observation, args.pic)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done or (max_episode_length and episode_steps >= max_episode_length):
            # end of episode
            if step > args.warmup:
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(fenv, agent.select_action,
                                               debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if args.env == "Paint":
                        writer.add_image(str(step) + '.png', env.canvas)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                if step > args.warmup:
                    log += 1
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} '
                        'interval_time:{:.2f} train_time:{:.2f}'.format(
                            episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    if debug:
        prRed('[Save model] #{}'.format(save_num))
    agent.save_model(output, save_num)
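# `fastenv` wraps the raw environment so that each agent action is repeated
# `action_repeat` times with the rewards summed, which shortens the effective
# episode (hence `max_episode_length // args.action_repeat` above). A minimal
# sketch under that assumption; the `vis`/`atari` flags and any frame
# preprocessing in the original wrapper are elided.
class fastenv:
    def __init__(self, env, action_repeat, vis=False, atari=False):
        self.env = env
        self.action_repeat = action_repeat
        self.vis = vis

    def reset(self):
        return self.env.reset()

    def step(self, action):
        total_reward = 0.
        for _ in range(self.action_repeat):
            observation, reward, done, info = self.env.step(action)
            total_reward += reward
            if self.vis:
                self.env.render()
            if done:
                break
        return observation, total_reward, done, info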
def train(num_iterations, agent, env):
    fenv = fastenv(env, args.action_repeat)
    window_length = args.window_length
    save_interval = args.save_interval
    debug = args.debug
    output = args.output
    time_stamp = 0.
    log = 0
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = args.noise_level * random.uniform(0, 1) / 2.
    save_num = 0
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)
        # agent picks an action: random during warmup, policy + noise afterwards
        if step <= args.warmup:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done:
            # [optional] save
            if step > args.warmup:
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{} in {}'.format(save_num, args.output))
                    agent.save_model(output, save_num)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            # one policy update per environment step taken this episode
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} real noise_level:{:.2f} '
                        'interval_time:{:.2f} train_time:{:.2f}'.format(
                            episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = args.noise_level * random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
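# All of these loops read hyperparameters from a module-level `args` and log
# through a module-level TensorBoard `writer`. A sketch of that scaffolding;
# only a subset of the flags used above is shown, and the defaults are
# placeholders, not the project's actual values.
import argparse

from tensorboardX import SummaryWriter

parser = argparse.ArgumentParser()
parser.add_argument('--warmup', type=int, default=100)
parser.add_argument('--window_length', type=int, default=1)
parser.add_argument('--action_repeat', type=int, default=1)
parser.add_argument('--save_interval', type=int, default=100)
parser.add_argument('--noise_level', type=float, default=1.)
parser.add_argument('--output', type=str, default='output')
parser.add_argument('--debug', action='store_true')
args = parser.parse_args()

writer = SummaryWriter(args.output)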
def train(num_iterations, agent, env, evaluate, bullet):
    fenv = fastenv(env, args.action_repeat, args.vis)
    window_length = args.window_length
    validate_interval = args.validate_interval
    save_interval = args.save_interval
    max_episode_length = args.max_episode_length // args.action_repeat
    debug = args.debug
    visualize = args.vis
    traintimes = args.traintimes
    output = args.output
    resume = args.resume
    ace = args.ace
    validate_episodes = args.validate_episodes
    # [optional] Actor-Critic Ensemble, https://arxiv.org/pdf/1712.08987.pdf
    if ace != 1:
        ensemble = ACE(nb_status, nb_actions, args)
    if resume is not None:
        print('load weight')
        if ace != 1:
            ensemble.load(output)
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        # save replay memory and a checkpoint on Ctrl-C, then exit
        print('memory saving...')
        agent.memory.save(output)
        agent.save_model(output, 0)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    save_num = 0
    while step <= num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = fenv.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(
                window_length, observation, args.pic)
            agent.reset(observation)
        # agent picks an action: random during warmup, policy + noise afterwards
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = fenv.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(
            window_length, observation, args.pic)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        if done or (max_episode_length and episode_steps >= max_episode_length):
            # end of episode
            if step > args.warmup:
                # [optional] save
                if episode > 0 and save_interval > 0 and episode % save_interval == 0:
                    save_num += 1
                    if debug:
                        prRed('[Save model] #{}'.format(save_num))
                    agent.save_model(output, save_num)
                    if ace != 1:
                        ensemble.append(output, save_num)
                # [optional] evaluate
                if episode > 0 and validate_interval > 0 and episode % validate_interval == 0:
                    validate_reward = evaluate(env, agent.select_action,
                                               debug=debug, visualize=False)
                    if debug:
                        prRed('Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                            step, np.mean(validate_reward), np.var(validate_reward)))
                    if ace != 1 and save_num >= 1:
                        validate_reward2 = evaluate(env, ensemble,
                                                    debug=debug, visualize=False)
                        if debug:
                            prRed('ACE Step_{:07d}: mean_reward:{} reward_var:{}'.format(
                                step, np.mean(validate_reward2), np.var(validate_reward2)))
                    writer.add_scalar('validate/reward', np.mean(validate_reward), step)
                    if ace != 1 and save_num >= 1:
                        writer.add_scalar('validate/ACE_reward',
                                          np.mean(validate_reward2), step)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            # one policy update per environment step taken this episode
            for i in range(episode_steps):
                if step > args.warmup:
                    log += 1
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('train/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('train/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.3f} steps:{} noise_scale:{:.2f} '
                        'interval_time:{:.2f} train_time:{:.2f}'.format(
                            episode, episode_reward, step, noise_level,
                            train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('train/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    # save memory and final model on normal termination too
    sigint_handler(0, 0)
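# `ACE` above refers to the Actor-Critic Ensemble of Huang et al.
# (https://arxiv.org/pdf/1712.08987.pdf): at evaluation time each actor in the
# ensemble proposes an action, every critic scores every proposal, and the
# proposal with the highest mean Q-value is executed. The call sites only fix
# the interface (constructor, `load`/`append`, use as a policy callable); the
# sketch below fills in that selection rule, and the checkpoint file names and
# critic signature are assumptions, not the original implementation.
import torch


class ACE:
    def __init__(self, nb_status, nb_actions, args):
        self.actors = []
        self.critics = []

    def append(self, output, save_num):
        # assumed layout: checkpoint #save_num stored as pickled modules
        self.actors.append(torch.load('{}/actor_{}.pkl'.format(output, save_num)))
        self.critics.append(torch.load('{}/critic_{}.pkl'.format(output, save_num)))

    def load(self, output):
        # omitted: scan `output` for previously saved checkpoints and append them
        pass

    def __call__(self, observation):
        state = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)
        with torch.no_grad():
            proposals = [actor(state) for actor in self.actors]
            # average each proposal's Q-value over all critics
            scores = [torch.stack([critic(state, a) for critic in self.critics]).mean()
                      for a in proposals]
        best = max(range(len(proposals)), key=lambda i: float(scores[i]))
        return proposals[best].squeeze(0).numpy()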
def train(num_iterations, agent, env, evaluate, validate_interval, output,
          window_length, max_episode_length=None, debug=False, visualize=False,
          traintimes=None, resume=None):
    if resume is not None:
        print('load weight')
        agent.load_weights(output)
        agent.memory.load(output)

    def sigint_handler(signum, frame):
        # save replay memory on Ctrl-C, then exit
        print('memory saving...')
        agent.memory.save(output)
        print('done')
        exit()
    signal.signal(signal.SIGINT, sigint_handler)

    time_stamp = 0.
    log = 0
    agent.is_training = True
    step = episode = episode_steps = 0
    episode_reward = 0.
    observation = None
    max_reward = -100000.
    episode_num = 0
    episode_memory = queue()
    noise_level = random.uniform(0, 1) / 2.
    while step < num_iterations:
        # reset if it is the start of episode
        if observation is None:
            episode_memory.clear()
            observation = env.reset()
            episode_memory.append(observation)
            observation = episode_memory.getObservation(window_length, observation)
            agent.reset(observation)
        # agent picks an action: random during warmup, policy + noise afterwards
        if step <= args.warmup and resume is None:
            action = agent.random_action()
        else:
            action = agent.select_action(observation, noise_level=noise_level)
        # env responds with next_observation, reward, terminate_info
        observation, reward, done, info = env.step(action)
        episode_memory.append(observation)
        observation = episode_memory.getObservation(window_length, observation)
        # agent observes and updates policy
        agent.observe(reward, observation, done)
        # update counters
        step += 1
        episode_steps += 1
        episode_reward += reward
        # check max_episode_length for truthiness first so the default of None
        # means "no limit" instead of raising a TypeError
        if done or (max_episode_length and episode_steps >= max_episode_length - 1):
            # end of episode
            # [optional] evaluate, keeping only the best-performing model
            if evaluate is not None and validate_interval > 0 and episode % validate_interval == 0:
                policy = lambda x: agent.select_action(x, decay_epsilon=False, noise_level=0)
                validate_reward = evaluate(env, policy, debug=False,
                                           visualize=False, window_length=window_length)
                writer.add_scalar('data/validate_reward', validate_reward,
                                  episode // validate_interval)
                if debug:
                    prRed('[Evaluate and save] Step_{:07d}: mean_reward:{}'.format(
                        step, validate_reward))
                if validate_reward > max_reward and step != 0:
                    max_reward = validate_reward
                    agent.save_model(output)
            train_time_interval = time.time() - time_stamp
            time_stamp = time.time()
            for i in range(traintimes):
                log += 1
                if step > args.warmup:
                    Q, value_loss = agent.update_policy()
                    writer.add_scalar('data/Q', Q.data.cpu().numpy(), log)
                    writer.add_scalar('data/critic_loss',
                                      value_loss.data.cpu().numpy(), log)
            if debug:
                prBlack('#{}: train_reward:{:.2f} steps:{} noise:{:.2f} time:{:.2f},{:.2f}'.format(
                    episode, episode_reward, step, noise_level,
                    train_time_interval, time.time() - time_stamp))
            time_stamp = time.time()
            writer.add_scalar('data/train_reward', episode_reward, episode)
            # reset for the next episode
            noise_level = random.uniform(0, 1) / 2.
            episode_num += 1
            observation = None
            episode_steps = 0
            episode_reward = 0.
            episode += 1
    # save memory on normal termination too
    sigint_handler(0, 0)
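# `prRed` / `prBlack` used throughout are small colored-logging helpers; a
# minimal sketch (the exact ANSI codes in the original utility may differ):
def prRed(prt):
    print('\033[91m{}\033[00m'.format(prt))


def prBlack(prt):
    # default terminal color
    print(prt)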