def train(num_episodes=1000, save_every=100, checkpoint_dir="checkpoints",
          tensorboard_dir="tensorboard", tboard_every=10):
    pol = Policy()
    writer = tf.contrib.summary.create_file_writer(tensorboard_dir)
    for j in range(1, num_episodes + 1):
        random_secret = random.randint(0, config.max_guesses - 1)
        e = Episode(pol, random_secret)
        history = e.generate()
        print("Episode length: {}".format(len(history)))
        G = -1
        optimizer = tf.train.GradientDescentOptimizer(
            learning_rate=config.reinforce_alpha * G)
        for i in reversed(range(1, len(history))):
            history_so_far = history[:i]
            next_action, _ = history[i]
            with tfe.GradientTape() as tape:
                action_logits = pol(history_so_far, with_softmax=False)
                loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                    labels=tf.one_hot(
                        tf.convert_to_tensor([next_action]),
                        config.max_guesses),
                    logits=action_logits)
            grads = tape.gradient(loss, pol.variables)
            optimizer.apply_gradients(zip(grads, pol.variables))
            G -= 1
            optimizer._learning_rate = G * config.reinforce_alpha
            optimizer._learning_rate_tensor = None
            # hack. Should be able to pass a callable as learning_rate, see
            # https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer#args
            # can I perhaps submit a PR to fix this bug?
            sys.stdout.write("{}/{}\r".format(len(history) - i, len(history)))
        if j % save_every == 0 or j == num_episodes:
            saver = tfe.Saver(pol.named_variables)
            save_path = os.path.join(
                checkpoint_dir,
                "episode{}".format(str(j).zfill(len(str(num_episodes)))))
            saver.save(save_path)
        if j % tboard_every == 0:
            with writer.as_default():
                with tf.contrib.summary.always_record_summaries():
                    tf.contrib.summary.scalar(
                        'total_return', tf.convert_to_tensor([G]), step=j)
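# A hedged alternative to the private-attribute hack in train() above: rather
# than mutating optimizer._learning_rate after every step, rebuild the
# GradientDescentOptimizer with the current return-scaled rate. This is only a
# sketch; reinforce_step is a hypothetical helper, and it assumes the same
# Policy, config, tf, and tfe objects that train() uses.
def reinforce_step(pol, history, i, G):
    """One REINFORCE gradient step on the i-th (state, action) pair."""
    next_action, _ = history[i]
    with tfe.GradientTape() as tape:
        action_logits = pol(history[:i], with_softmax=False)
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.one_hot(tf.convert_to_tensor([next_action]),
                              config.max_guesses),
            logits=action_logits)
    grads = tape.gradient(loss, pol.variables)
    # A fresh optimizer per step keeps the learning-rate logic on the public API.
    optimizer = tf.train.GradientDescentOptimizer(
        learning_rate=G * config.reinforce_alpha)
    optimizer.apply_gradients(zip(grads, pol.variables))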
def load_policies(directory):
    directory = f"saved_models/{directory}"
    policies = {}
    for file in os.listdir(directory):
        model = tf.keras.models.load_model(f"{directory}/{file}")
        pol = Policy(hex_config["size"])
        pol.model = model
        policies[file] = pol
    return policies
def __init__(self):
    self.episodes = config["episodes"]
    self.amount_of_players = config["amount_of_players"]
    self.epsilon = config["epsilon"]
    self.epsilon_decay_rate = self.epsilon / self.episodes
    self.states = []
    self.distributions = []
    self.rewards = []
    self.policy = Policy(hex_config["size"] ** 2)
def __init__(self, env):
    # Load your Model here
    self.sess = tf.Session()
    self.saver = tf.train.import_meta_graph('policy_model/.meta')
    self.action_size = env.action_space.shape[0]
    self.policy = Policy(env.observation_space.shape[0], self.action_size,
                         0.003, 10, -1.0, None)
    self.saver.restore(self.sess, tf.train.latest_checkpoint('policy_model/'))
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult,
         policy_logvar, clipping_range):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
        hid1_mult: hid1 size for policy and value_f (multiplier of obs dimension)
        policy_logvar: natural log of initial policy variance
        clipping_range: clipping range for the policy objective update
    """
    # saver = tf.train.Saver()
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim, hid1_mult)
    policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar,
                    clipping_range)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    # saver.save(policy.sess, 'model_save', global_step=500)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def load_policy(world, data):
    n = world.width * world.height
    if data == 'RAND':
        table = np.full((n, 4), 1. / 4.)
        return Policy(table)
    return None
def load_policy(file_name):
    encoder = Encoder(in_channels=h.in_channels, feature_dim=h.feature_dim)
    policy = Policy(encoder=encoder, feature_dim=h.feature_dim, num_actions=15)
    policy.cuda()
    policy.load_state_dict(
        torch.load(MODEL_PATH + file_name + '.pt')["policy_state_dict"])
    policy.cuda()
    return policy
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    # TODO: Change init_gym for one of my functions
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    aigym_path = os.path.join('/tmp', env_name, now)
    # TODO: Find out what this does.
    # Change wrappers.Monitor for a class of mine that controls the simulation.
    # I think the wrapper is of no use for my example.
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # run a few episodes of untrained policy to initialize scaler:
    run_policy(env, policy, scaler, logger, episodes=5)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def record():
    '''
    This function generates a gif file for a single episode.
    This process may take some time.
    To watch the non-stop game play, please run the test() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    figure_path = FIGURE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    list_obs = []
    list_reward = []
    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]
    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)
    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")
        total_reward = 0
        obs = env.reset()
        while True:
            list_obs.append(obs)
            list_reward.append(total_reward)
            env.render()
            # Get observation.
            obs = (obs - obs_mean) / obs_std
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))
            # Interact with the environment.
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                list_obs.append(obs)
                list_reward.append(total_reward)
                break
    env.close()
    # Record the gameplay.
    imageio.mimsave(
        figure_path + "gameplay.gif",
        [plot_obs(obs, reward) for obs, reward in zip(list_obs, list_reward)],
        fps=30)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name, False)
    if time_state:
        obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    now = datetime.utcnow().strftime("%b-%d_%H-%M-%S")  # create unique directories
    logger = Logger(logname=env_name, now=now)
    scaler = Scaler(obs_dim, env_name)
    val_func = NNValueFunction(obs_dim, env_name, True)
    arg = [obs_dim, act_dim, kl_targ, time_state, env_name]
    policy = Policy(obs_dim, act_dim, kl_targ, env_name, True)
    episode = 0
    while episode < num_episodes:
        trajectories = run_policy(env, policy, scaler, logger, arg,
                                  episodes=batch_size)
        episode += len(trajectories)
        add_value(trajectories, val_func)  # add estimated values to episodes
        add_disc_sum_rew(trajectories, gamma)  # calculate discounted sum of Rs
        add_gae(trajectories, gamma, lam)  # calculate advantage
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        # add various stats to training log:
        log_batch_stats(observes, actions, advantages, disc_sum_rew, logger,
                        episode)
        policy.update(observes, actions, advantages, logger)  # update policy
        val_func.fit(observes, disc_sum_rew, logger)  # update value function
        logger.write(display=True)  # write logger results to file and stdout
        scaler.save()
        if killer.kill_now:
            if input('Terminate training (y/[n])? ') == 'y':
                break
            killer.kill_now = False
    logger.close()
    policy.close_sess()
    val_func.close_sess()
def __init__(self, domain: Assignment1Domain, epsilon: Optional[float] = 0.01,
             max_iterations: Optional[int] = 1000,
             gamma: Optional[float] = 0.9):
    self._domain = domain
    self._policy = Policy(domain)
    self._gamma = gamma
    self._epsilon = epsilon
    self._max_iterations = max_iterations
def __init__(self, state_size, action_size, sample_num):
    sess = tf.Session()
    self.policy = Policy(sess, state_size, action_size, sample_num)
    self.state_batch = []
    self.action_batch = []
    self.reward_list = []
    self.step_list = []
    self.weight_bach = []
    self.sample_num = sample_num
    sess.run(tf.global_variables_initializer())
def __init__(self, policy_params, env_name, noise):
    self.env = gym.make(env_name)
    self.transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((128, 128)),
        # transforms.Grayscale(),
        transforms.ToTensor()
    ])
    self.noise = SharedNoiseTable(noise)
    self.policy = Policy(**policy_params)
def test_save_restore(self):
    pol = Policy()
    episode = [(0, 0), (1, 0), (2, 3)]
    expected = pol(episode).numpy()
    with tempfile.TemporaryDirectory() as tdir:
        path = os.path.join(tdir, "checkpt")
        saver = tfe.Saver(pol.named_variables)
        saver.save(path)
        pol2 = Policy()

        def diff():
            actual = pol2(episode).numpy()
            return np.linalg.norm(actual - expected)

        # A freshly initialized policy should differ from the saved one...
        self.assertGreater(diff(), 0.0001)
        saver = tfe.Saver(pol2.named_variables)
        saver.restore(path)
        # ...and match it (up to numerical noise) after restoring the checkpoint.
        self.assertLessEqual(diff(), 0.00001)
def init(self):
    with open('../config/config.yaml') as f:
        _, config_features, _ = yaml.load_all(f)
    self.features = config_features
    self.redis = redis.StrictRedis('localhost')
    # self.redis.flushdb()
    self.redis.set_response_callback('HGETALL', self.hgetall_custom_callback)
    self.policy = Policy(self.redis)
    self.add_default_policy()
def from_saved_model(self, modelpath, hot_one=True):
    state = torch.load(realpath(modelpath))
    self.policy = Policy(state["params"]["layers"], device=cpu)
    self.policy.load_state_dict(state["state_dict"])
    self.hot_one = hot_one
    if "rewarder" in state["params"]:
        self.displayname = state["params"]["rewarder"].__class__.__name__
    else:
        self.displayname = "Saved model"
    self.loaded = "policy"
def __init__(self, name, globalP):
    self.env = Toy()
    self.name = name
    self.policy = Policy(name + '/Policy', env=self.env,
                         state_shape=self.env.observation_shape,
                         n_actions=16)
    self.policy.build()
    self.pull_global_op = get_pull_global(globalP, self.policy)
    self.update_global_op = get_update_global(globalP, self.policy)
def run_wrapper(model_dir, mode, input_frame, num_intentions=3,
                scale_x=1, scale_z=1, rate=28):
    rospy.init_node("joy_controller")
    controller = Controller(mode, scale_x, scale_z, rate)
    policy = Policy(mode, input_frame, 2, model_dir, num_intentions)
    controller.execute(policy)
def __init__(self, *args, **kwargs):
    super(QLearning, self).__init__(*args)
    self.max_episodes = kwargs.get("max_episodes", 200)
    self.alpha = kwargs.get("alpha", 0.9)
    self.Q = defaultdict(lambda: 0)
    self.epsilon = kwargs.get("epsilon", 0.8)
    self.V_evaluator = V_evaluator(
        self.environment,
        Policy(self.environment, self.Q),
        lambda state, action: self.Q[(state, action)])
def main(): """Builds a Policy object out of an inventory and policy file and optionally generates reachability tables in HTML or CSV formats.""" parser = argparse.ArgumentParser( description= 'Liest Policies aus einer Datei und übersetzt sie wahlweise in HTML oder CSV.' ) parser.add_argument( 'files', metavar='FILE', nargs='+', help= 'Either an inventory file followed by a policy file, or a single file that combines both.' ) parser.add_argument('--html', dest='generate_html', action='store_const', const=True, default=False, help='Generate the html file.') parser.add_argument('--csv', dest='generate_csv', action='store_const', const=True, default=False, help='Generate the csv file.') args = parser.parse_args() files = [] try: for i in range(min(2, len(args.files))): files.append(open(args.files[i], 'r')) except IOError: print("Fehler: Datei(en) konnte(n) nicht gelesen werden.") sys.exit(1) policy_chars = "".join([file.read() for file in files]) policy = Policy() try: PolicyBuilder.build(policy_chars, policy) prefix = args.files[-1].rsplit('.', 1)[0] if args.generate_html: html_file = open(prefix + '-reachability.html', 'w') html_file.write(policy.to_html()) if args.generate_csv: csv_file = open(prefix + '-reachability.csv', 'w') csv_file.write(policy.vlans_to_csv()) except PolicyException, exception: print("Fehler: %s" % exception)
def __init__(self):
    self.no_cells = Hyper.N * Hyper.N
    # self.results = np.zeros((2, int(Hyper.total_episodes / 100) + 1), dtype=np.int16)
    self.results = np.zeros((2, Hyper.total_episodes), dtype=np.int16)
    self.no_episodes = 0
    self.setup_display_dict()
    self.setup_env()
    self.setup_reward_dict()
    self.setup_action_dict()
    self.policy = Policy()
    self.timesteps_per_episode = []
    self.rewards_per_episode = []
def __init__(self, env: UnityMlFacade, device, seed, verbose=1, gamma=0.99,
             actor_learning_rate=0.001, critic_learning_rate=0.001,
             buffer_size=100000, batch_size=100, snapshot_window=5,
             hidden_layers_comma_sep='400,30'):
    self.env = env
    self.device = device
    self.seed = seed
    self.verbose = verbose
    self.gamma = gamma
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.snapshot_window = snapshot_window
    self.policy_snapshots = deque(maxlen=self.snapshot_window)
    self.current_policy_snapshot = -1
    self.last_save = 0
    self.last_swap = 0
    self.action_size = self.env.action_space.shape[0] * self.env.num_agents
    self.state_size = self.env.observation_space.shape[0] * self.env.num_agents  # this should be 48
    hidden_layers = [int(layer_width)
                     for layer_width in hidden_layers_comma_sep.split(',')]
    # create agent1
    self.player_policy = Policy(0, state_size=self.state_size,
                                action_size=self.action_size,
                                hidden_dims=hidden_layers, device=self.device,
                                actor_learning_rate=actor_learning_rate,
                                critic_learning_rate=critic_learning_rate,
                                random_seed=seed)
    # create agent2
    self.opponent_policy = Policy(1, state_size=self.state_size,
                                  action_size=self.action_size,
                                  hidden_dims=hidden_layers, device=self.device,
                                  actor_learning_rate=actor_learning_rate,
                                  critic_learning_rate=critic_learning_rate,
                                  random_seed=seed)
    self.t_step = 0
def test_policy():
    rospy.init_node('controller')
    con = Controller(None)
    con.register(TeleControl())
    # clf = None
    clf = Policy(config.TASK)
    con.register(AutoControl(clf, config.TASK, 'a'))
    try:
        con.run()
    finally:
        con.pub.publish(Twist())
    return clf
def __init__(self, num_actions=3, num_means=2, gamma=0.99):
    print(num_actions, num_means)
    self.basis_function = Basis_Function(num_means, num_means, num_actions,
                                         gamma)
    num_basis = self.basis_function._num_basis()
    self.policy = Policy(self.basis_function, num_basis)
    self.lstdq = LSTDQ(self.basis_function, gamma, self.policy)
    self.stop_criterium = 10 ** -5
    self.gamma = gamma
def policy_iteration(self, pol: Policy):
    pol = Policy(
        {s: {a: 1. / len(v) for a in v}
         for s, v in self.rewards.items()})
    v_old = self.get_state_value_func(pol)
    converge = False
    while not converge:
        pol = self.greedy_improved_policy(pol)
        v_new = self.iterative_policy_evaluation(pol)
        converge = is_equal(np.linalg.norm(v_new), np.linalg.norm(v_old))
        v_old = v_new
    return pol
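# A small sketch of a stricter stopping rule for policy_iteration above: two
# value functions can share a norm without being equal, so comparing the
# values elementwise is safer. values_converged is a hypothetical helper and
# the tolerance is an assumed choice, not taken from the original code.
def values_converged(v_new, v_old, tol=1e-8):
    """Return True when the two value functions agree elementwise within tol."""
    return np.allclose(np.asarray(v_new), np.asarray(v_old), atol=tol)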
def evaluate_agent(env, obs_dim, act_dim, num_episodes):
    policy = Policy(obs_dim, act_dim, 0.003)
    policy.restore_weights()
    scaler = Scaler(obs_dim)
    print("Restored weights, evaluating...")
    for i_episode in range(num_episodes):
        run_episode(env, policy, scaler, 100000, stochastic=True)
    env.kill()
def __init__(self, policy, mean_model=None, variance_model=None, x_norm=None,
             u_norm=None, y_norm=None):
    super(ExpectedDistanceProduction, self).__init__()
    from policy import Policy
    self.policy = Policy(7, 3, 4)
    self.mean = RewardFCPlain(7, 4, 3)
    self.variance = FCPositive(7, 4, 3)
    self.x_norm = Normalization(7)
    self.u_norm = Normalization(4)
    self.g_norm = Normalization(3)
    self.register_buffer(
        'weights',
        torch.FloatTensor([1.0, 1.0, 0.1]))
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)]
        batch_size: number of episodes per policy training batch
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)
    # create unique directories (utcnow gives Greenwich time, so changed utcnow to now)
    now = datetime.now().strftime("%b-%d_%H:%M:%S")
    testname = now + '-' + TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # load a pretrained model instead of initializing the scaler with an untrained policy:
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0
    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(
        env, policy, scaler, max_path_length=batch_size, animate=True)
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]
        logger.log(trajectory)
        logger.write(display=False)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def main():
    max_iteration = 5000
    episodes_per_batch = 20
    max_kl = 0.01
    init_logvar = -1
    policy_epochs = 5
    value_epochs = 10
    value_batch_size = 256
    gamma = 0.995
    lam = .97

    # initialize environment
    env = HumanoidEnv()
    env.seed(0)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    logger = Logger()

    # init qpos and qvel
    init_qpos = np.load('./mocap_expert_qpos.npy')
    init_qvel = np.load('./mocap_expert_qvel.npy')

    # policy function
    policy = Policy(obs_dim=obs_dim, act_dim=act_dim, max_kl=max_kl,
                    init_logvar=init_logvar, epochs=policy_epochs,
                    logger=logger)
    session_to_restore = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/model_humanoid_ego_1700'
    stats_to_recover = '/Users/sayvaz/Desktop/humanoid_gail_results/model_ego_inter/stats_humanoid_ego_1700'
    scale, offset = policy.restore_session(
        session_to_restore=session_to_restore,
        stats_to_recover=stats_to_recover)

    # expert agent
    agent = ExpertAgent(env=env, policy_function=policy, scale=scale,
                        offset=offset, init_qpos=init_qpos,
                        init_qvel=init_qvel, logger=logger)
    agent.collect(episodes_per_batch=20)

    # close everything
    policy.close_session()
def test():
    '''
    This function visualizes the game play.
    The environment will be reset immediately and the game will not be recorded.
    To record the game play, please run the record() function.
    '''
    save_path = SAVE_DIR + ENV_NAME + "/" + AUXILIARY_TASK + "/"
    obs_mean_std = np.load(save_path + "obs_mean_std.npz")
    obs_mean = obs_mean_std["obs_mean"]
    obs_std = obs_mean_std["obs_std"]
    # Create environment.
    env = make_atari(ENV_NAME)
    obs_space = env.observation_space
    action_space = env.action_space
    # Build models.
    policy = Policy(obs_space, action_space, is_training=False)
    with tf.Session() as sess:
        # Load variables.
        saver_policy = tf.train.Saver(policy.trainable_variables)
        saver_policy.restore(sess, save_path + "policy")
        total_step = 0
        total_reward = 0
        while True:
            # Get observation.
            if total_step == 0:
                obs = env.reset()
            else:
                obs = obs_next
            obs = (obs - obs_mean) / obs_std
            env.render()
            # Get action.
            action = sess.run(
                policy.action,
                feed_dict={policy.Obs: np.reshape(obs, [1, 1, *obs.shape])})
            action = np.squeeze(action, (0, 1))
            # Interact with the environment.
            obs_next, reward, done, _ = env.step(action)
            total_reward += reward
            if done:
                # Reset environment.
                print("Episodic reward: ", total_reward, sep="")
                obs_next = env.reset()
                total_reward = 0
            # Update step counter.
            total_step += 1
    env.close()