def main(argv):
  """Runs a trained PPO policy in the simulated environment and logs the total reward."""
  del argv  # Unused.
  config = utility.load_config(LOG_DIR)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True)
  network = config.network
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess,
        env,
        network,
        policy_layers=policy_layers,
        value_layers=value_layers,
        checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      time.sleep(0.002)  # Brief pause between control steps.
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)
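# The main() above relies on module-level setup that is not shown here. The
# sketch below is an assumption of what that setup looks like: the import paths
# mirror the pybullet minitaur agents package used elsewhere in this document,
# and LOG_DIR / CHECKPOINT are placeholder values, not the original ones.
import os
import time

import tensorflow as tf
from pybullet_envs.minitaur.agents.scripts import utility
from pybullet_envs.minitaur.agents.ppo import simple_ppo_agent

LOG_DIR = os.path.expanduser('~/minitaur_logdir')  # placeholder log directory
CHECKPOINT = 'model.ckpt-14000000'                 # placeholder checkpoint name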
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None, env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with config.unlocked:
    config.network = functools.partial(utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(
        lambda: _create_environment(config, outdir), num_agents, env_processes)
    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()
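# A hedged usage sketch for visualize(): render a few episodes from an existing
# training run. Both directories are placeholders, and a single environment is
# enough for rendering.
visualize(
    logdir=os.path.expanduser('~/logdir/minitaur-ppo'),  # placeholder run directory
    outdir=os.path.expanduser('~/videos'),               # placeholder output directory
    num_agents=1,
    num_episodes=5,
    checkpoint=None,  # None falls back to the most recent checkpoint
    env_processes=True)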
def create_env(envid, seed, render=False):
  """Create minitaur or other standard gym environment."""
  if 'minitaur' in envid:
    from pybullet_envs.minitaur.agents.scripts import utility
    from pybullet_envs.minitaur.agents import tools
    config = utility.load_config(os.path.expanduser('minitaur_config'))
    if 'bad' in envid:
      with config.unlocked:
        config.env.keywords['accurate_motor_model_enabled'] = False
        config.env.keywords['control_latency'] = .0
        config.env.keywords['pd_latency'] = .0
        config.env.keywords['urdf_version'] = None
    env = config.env(render=render)
    if config.max_length:
      env = tools.wrappers.LimitDuration(env, config.max_length)
    env = tools.wrappers.RangeNormalize(env)
    env = tools.wrappers.ClipAction(env)
    env = tools.wrappers.ConvertTo32Bit(env)

    class MySpec(object):

      def __init__(self, max_episode_steps):
        self.max_episode_steps = max_episode_steps

    env.spec = MySpec(1000)
  else:
    env = gym.make(envid)
  # Set up seed.
  env.seed(seed)
  return env
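# A hedged usage sketch for create_env(): any id containing 'minitaur' routes
# through the pybullet config branch above (which expects a saved
# 'minitaur_config' directory); other ids fall back to gym.make(). The short
# random rollout below only exercises the standard gym interface.
env = create_env('minitaur', seed=0, render=False)
observation = env.reset()
for _ in range(10):
  observation, reward, done, _ = env.step(env.action_space.sample())
  if done:
    observation = env.reset()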
def main(_):
  """Create or load configuration and launch the trainer."""
  utility.set_up_logging()
  if not FLAGS.config:
    raise KeyError('You must specify a configuration.')
  logdir = FLAGS.logdir and os.path.expanduser(
      os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
  try:
    config = utility.load_config(logdir)
  except IOError:
    config = tools.AttrDict(getattr(configs, FLAGS.config)())
    config = utility.save_config(config, logdir)
  for score in train(config, FLAGS.env_processes):
    tf.logging.info('Score {}.'.format(score))
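# A sketch of the command-line flags main() reads. The flag names come from the
# FLAGS references above; the defaults and help strings here are assumptions.
import datetime

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('logdir', None, 'Base directory to store logs in.')
tf.app.flags.DEFINE_string('timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
                           'Timestamp prefix for the run directory name.')
tf.app.flags.DEFINE_string('config', None, 'Name of the configuration to train.')
tf.app.flags.DEFINE_boolean('env_processes', True,
                            'Step environments in separate processes to avoid interference.')

if __name__ == '__main__':
  tf.app.run()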
def main(argv):
  """Runs the trained policy while streaming motor commands to an Arduino over serial."""
  del argv  # Unused.
  config = utility.load_config(LOG_DIR)
  print(LOG_DIR)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True)
  network = config.network
  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(
        sess,
        env,
        network,
        policy_layers=policy_layers,
        value_layers=value_layers,
        checkpoint=os.path.join(LOG_DIR, CHECKPOINT))
    sum_reward = 0
    observation = env.reset()
    while True:
      # Drive command: tell the Arduino to start actuating the motors.
      command = 'd'
      command = bytes(command, encoding='utf8')
      arduino.write(command)
      # PPO: get an action from the trained policy.
      action = agent.get_action([observation])
      # Transform the PPO action into per-motor commands.
      o_action = copy.deepcopy(action)
      o_action = env.transform_action_to_motor_command(o_action[0])
      print("----- each motor angle (radians) -----")
      print(o_action)
      print("----- each motor angle (degrees) -----")
      pi = 3.14159265359
      deg = []
      i = 0
      for each_rad in o_action[:2]:
        now = 180 * each_rad / pi
        # Output the action: motor index first, then the target angle.
        command = str(i)
        command = bytes(command, encoding='utf8')
        arduino.write(command)
        '''
        command = input('Enter rotation direction (+/-)')
        command = bytes(command, encoding='utf8')
        arduino.write(command)
        '''
        command = str(now)
        command = bytes(command, encoding='utf8')
        print(command)
        arduino.write(command)
        # For higher speeds, the Arduino code needs to be changed.
        command = '10000'
        command = bytes(command, encoding='utf8')
        print(command)
        arduino.write(command)
        i = i + 1
      '''
      msg = arduino.read(14)
      msg = binascii.b2a_hex(msg).decode('utf-8')
      print(msg)
      '''
      observation, reward, done, _ = env.step(action[0])
      '''
      replace observation with a real observation
      '''
      time.sleep(0.002)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)
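# The loop above writes commands to a global `arduino` serial handle. A minimal
# sketch of how that handle might be opened with pyserial; the port name and
# baud rate are placeholders, not values from the original script.
import serial

arduino = serial.Serial('/dev/ttyUSB0', baudrate=9600, timeout=1)  # placeholder port and baud rate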