if train_model:
    trainer.write_text(summary_writer, 'Hyperparameters', options, steps)
while steps <= max_steps or not train_model:
    if env.global_done:
        info = env.reset(train_mode=train_model, progress=get_progress())[brain_name]
        trainer.reset_buffers(info, total=True)
    # Decide and take an action
    new_info = trainer.take_action(info, env, brain_name, steps, normalize)
    info = new_info
    trainer.process_experiences(info, time_horizon, gamma, lambd)
    if len(trainer.training_buffer['actions']) > buffer_size and train_model:
        # Perform gradient descent with experience buffer
        trainer.update_model(batch_size, num_epoch)
    if steps % summary_freq == 0 and steps != 0 and train_model:
        # Write training statistics to tensorboard.
        trainer.write_summary(summary_writer, steps, env._curriculum.lesson_number)
    if steps % save_freq == 0 and steps != 0 and train_model:
        # Save Tensorflow model
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
    if train_model:
        steps += 1
        sess.run(ppo_model.increment_step)
        if len(trainer.stats['cumulative_reward']) > 0:
            mean_reward = np.mean(trainer.stats['cumulative_reward'])
            sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
            last_reward = sess.run(ppo_model.last_reward)
# Final save Tensorflow model
if steps != 0 and train_model:
    save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
graph_name = (env_name.strip()
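# The reset call above passes progress=get_progress() so the curriculum can
# advance lessons. A minimal sketch of that helper, assuming the curriculum
# exposes a measure type and that a `curriculum_file` flag exists (both names
# are assumptions, not taken from this script):
def get_progress():
    if curriculum_file is None:
        return None
    if env._curriculum.measure_type == "progress":
        # Fraction of the training budget consumed so far
        return steps / max_steps
    if env._curriculum.measure_type == "reward":
        # Most recent mean cumulative reward tracked by the model
        return last_reward
    return None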
sess.run(init)
steps = sess.run(ppo_model.global_step)
summary_writer = tf.summary.FileWriter(summary_path)
info = env.reset(train_mode=train_model)[brain_name]
trainer = Trainer(ppo_model, sess, info, is_continuous, use_observations, use_states)
while steps <= max_steps or not train_model:
    if env.global_done:
        info = env.reset(train_mode=train_model)[brain_name]
    # Decide and take an action
    new_info = trainer.take_action(info, env, brain_name)
    info = new_info
    trainer.process_experiences(info, time_horizon, gamma, lambd)
    if len(trainer.training_buffer['actions']) > buffer_size and train_model:
        # Perform gradient descent with experience buffer
        trainer.update_model(batch_size, num_epoch)
    if steps % summary_freq == 0 and steps != 0 and train_model:
        # Write training statistics to tensorboard.
        trainer.write_summary(summary_writer, steps)
    if steps % save_freq == 0 and steps != 0 and train_model:
        # Save Tensorflow model
        save_model(sess, model_path=model_path, steps=steps, saver=saver)
    steps += 1
    sess.run(ppo_model.increment_step)
# Final save Tensorflow model
if steps != 0 and train_model:
    save_model(sess, model_path=model_path, steps=steps, saver=saver)
env.close()
export_graph(model_path, env_name)
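# The loop above relies on save_model() and export_graph(), which are defined
# elsewhere in the training script. A minimal sketch of what they might look
# like with the TF1 checkpoint/freeze tooling; the 'action' output node name
# and the file names are assumptions, not taken from this script:
import tensorflow as tf
from tensorflow.python.tools import freeze_graph


def save_model(sess, model_path, steps, saver):
    # Write a checkpoint plus a raw GraphDef so the graph can be frozen later
    saver.save(sess, model_path + '/model-' + str(steps) + '.cptk')
    tf.train.write_graph(sess.graph_def, model_path, 'raw_graph_def.pb', as_text=False)


def export_graph(model_path, env_name, output_node_names="action"):
    # Freeze the latest checkpoint into a single self-contained graph file
    ckpt = tf.train.get_checkpoint_state(model_path)
    freeze_graph.freeze_graph(input_graph=model_path + '/raw_graph_def.pb',
                              input_binary=True,
                              input_checkpoint=ckpt.model_checkpoint_path,
                              output_node_names=output_node_names,
                              output_graph=model_path + '/' + env_name + '.bytes',
                              clear_devices=True, initializer_nodes="", input_saver="",
                              restore_op_name="save/restore_all",
                              filename_tensor_name="save/Const:0")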
trainer.reset_buffers(info, total=True)
# Decide and take an action
info = trainer.take_action(info, env, brain_name, steps, normalize, stochastic=True)
trainer.process_experiences(info, time_horizon, gamma, lambd)
if len(trainer.training_buffer['actions']) > buffer_size and train_model:
    # Perform gradient descent with experience buffer
    trainer.update_model(batch_size, num_epoch)
if steps % summary_freq == 0 and steps != 0 and train_model:
    # Write training statistics to tensorboard.
    trainer.write_summary(summary_writer, steps, episode_number)
if steps % save_freq == 0 and steps != 0 and train_model:
    # Save Tensorflow model
    save_model(sess, model_path=model_path, steps=steps, saver=saver)
if train_model:
    steps += 1
    sess.run(ppo_model.increment_step)
    if len(trainer.stats['cumulative_reward']) > 0:
        mean_reward = np.mean(trainer.stats['cumulative_reward'])
        sess.run(ppo_model.update_reward, feed_dict={ppo_model.new_reward: mean_reward})
        last_reward = sess.run(ppo_model.last_reward)
if not watcher_started and render:
    watcher = threading.Thread(target=watch, args=(tf.get_default_session(),))
    watcher.start()
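# take_action is called above with stochastic=True, i.e. the policy samples
# exploratory actions rather than acting greedily. A hypothetical sketch of
# that distinction for a continuous Gaussian policy (the function and its
# arguments are illustrative, not this trainer's API):
import numpy as np


def select_continuous_action(action_mean, log_std, stochastic=True):
    # Sample around the policy mean during training; use the mean itself
    # for deterministic evaluation runs
    if stochastic:
        noise = np.random.randn(*action_mean.shape)
        return action_mean + np.exp(log_std) * noise
    return action_mean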