def main():
    env = None
    # environment selection
    if args.lake:
        env = FrozenLakeEnv(map_name='8x8')
        if args.size > 0:
            env = FrozenLakeEnv(desc=None, map_name=None, size=args.size)
    if args.tower:
        rings = tuple(range(args.rings - 1, -1, -1))
        print(rings)
        init = (rings, (), ())
        goal = ((), (), rings)
        env = TohEnv(initial_state=init, goal_state=goal, noise=args.noise)
    print('> env number of states: {}'.format(env.nS))
    print('> noise factor: {}'.format(args.noise))

    # solver selection
    discount = args.discount
    print('> discount factor: {}'.format(discount))
    if args.vi:
        vi_policy = value_iteration(env, discount=discount)
        policy = vi_policy
        print_policy(vi_policy)
    if args.pi:
        pi_policy = policy_iteration(env, discount=discount)
        # reshape to 1d array
        pi_policy = np.reshape(np.argmax(pi_policy, axis=1), [env.nS])
        policy = pi_policy
        print_policy(pi_policy)
    if args.vi and args.pi:
        # Compare the two policies
        diffs = policy_differences(vi_policy, pi_policy)
        print(diffs)
        print('VI and PI policy differences: {}'.format(sum(diffs.values())))
    if args.q:
        Q = q_learning(env, total_episodes=args.episodes)
        # The optimal policy for Q learning is the argmax action with probability 1 - epsilon.
        q_policy = np.reshape(np.argmax(Q, axis=1), [env.nS]).tolist()
        policy = q_policy
        print_policy(q_policy)

    print('Scoring the policy...')
    if args.lake:
        score_frozen_lake(env, policy)
    if args.tower:
        score_tower_of_hanoi(env, policy)
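# main() above reads its settings from a module-level `args` object that is not
# shown in this snippet. The sketch below is a hypothetical argparse setup that
# is merely consistent with the attributes main() accesses (args.lake, args.size,
# args.tower, args.rings, args.noise, args.discount, args.vi, args.pi, args.q,
# args.episodes); the defaults and help strings are assumptions, not the
# original author's parser.
import argparse


def init_args():
    parser = argparse.ArgumentParser(description='MDP solver experiments')
    parser.add_argument('--lake', action='store_true', help='use the FrozenLake environment')
    parser.add_argument('--size', type=int, default=0, help='random FrozenLake map size (0 = default 8x8 map)')
    parser.add_argument('--tower', action='store_true', help='use the Tower of Hanoi environment')
    parser.add_argument('--rings', type=int, default=4, help='number of Tower of Hanoi rings')
    parser.add_argument('--noise', type=float, default=0.0, help='transition noise factor')
    parser.add_argument('--discount', type=float, default=0.9, help='discount factor')
    parser.add_argument('--vi', action='store_true', help='run value iteration')
    parser.add_argument('--pi', action='store_true', help='run policy iteration')
    parser.add_argument('--q', action='store_true', help='run Q-learning')
    parser.add_argument('--episodes', type=int, default=10000, help='number of Q-learning episodes')
    return parser.parse_args()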
def get_environment(ENV_NAME):
    env_kwargs = {
        'map_name': ENV_NAME,
        'slip_rate': .2,
        'rewards': (-0.01, -1, 1)
    }
    env = FrozenLakeEnv(**env_kwargs)
    env = env.unwrapped
    return env
from utils import *
from methods import *
from joblib import Parallel, delayed
from MC import *
from true_online_GTD import *
import numpy as np  # np is used throughout; make the dependency explicit
import numpy.matlib, os
from frozen_lake import FrozenLakeEnv

unit = 1
env = FrozenLakeEnv(None, '4x4', True, unit)
N = env.observation_space.n
runtimes = 10
mc_episodes = int(1e7)
gamma = lambda x: 0.95
runtime = 0
target_policy = np.matlib.repmat(
    np.ones((1, env.action_space.n)) / env.action_space.n,
    env.observation_space.n, 1)
true_expectations = np.zeros((runtimes, env.action_space.n ** 2))
true_variances = np.zeros((runtimes, env.action_space.n ** 2))
stationary_dists = np.zeros((runtimes, env.action_space.n ** 2))
cumulative_expectation = np.zeros((1, env.action_space.n ** 2))
cumulative_variance = np.zeros((1, env.action_space.n ** 2))
cumulative_distribution = np.zeros((1, env.action_space.n ** 2))
count = 0
directory = 'frozenlake'
filelist = os.listdir(directory)
for filename in filelist:
import gym  # install this by "pip install gym"
import itertools
import matplotlib.style
import sys
import numpy as np
import plotting

matplotlib.style.use('ggplot')

if "../" not in sys.path:
    sys.path.append("../")

from collections import defaultdict
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv()


def make_epsilon_greedy_policy(Q, epsilon, num_actions):
    """
    Creates an epsilon-greedy policy based on a given Q-function and epsilon.

    Returns a function that takes the state as an input and returns the
    probabilities for each action as a numpy array whose length equals the
    size of the action space (the set of possible actions).
    """
    def policyFunction(state):
        Action_probabilities = np.ones(num_actions,
def count_different_entries(a, b):
    assert a.size == b.size, 'Arrays need to be the same size'
    return a.size - np.sum(np.isclose(a, b))


if __name__ == '__main__':
    for ENV_NAME in ENV_NAMES:
        gamma = 0.9
        theta = 0.0001
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
        }
        print(ENV_NAME)
        pi_env = FrozenLakeEnv(**env_kwargs)
        pi_env = pi_env.unwrapped
        print('policy iteration begin')
        pi_policy, pi_V, pi_iter, pi_time = policy_iteration(
            pi_env, discount_factor=gamma, theta=theta)
        print('policy iteration end')
        visualize_policy(pi_policy, ENV_NAME, pi_env.desc.shape, 'pi',
                         'Policy Iteration - Optimal Policy {} Iterations'.format(pi_iter))
        visualize_value(pi_V, ENV_NAME, pi_env.desc.shape, 'pi',
                        'Policy Iteration - Estimated Value of each State')

    for ENV_NAME in ENV_NAMES:
        gamma = 0.85
        theta = 0.001
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
import numpy as np
import sys
import matplotlib.style  # needed for matplotlib.style.use() below
import tensorflow as tf
import collections

from frozen_lake import FrozenLakeEnv

if "../" not in sys.path:
    sys.path.append("../")
from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

matplotlib.style.use('ggplot')

# env = CliffWalkingEnv()
# env = gym.make('FrozenLake-v0')
env = FrozenLakeEnv(is_slippery=False)


class PolicyEstimator():
    """
    Policy Function approximator.
    """

    def __init__(self, learning_rate=0.001, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just a table-lookup estimator
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.contrib.layers.fully_connected(
#!/usr/bin/env python
# coding: utf-8

# In[11]:

from frozen_lake import FrozenLakeEnv
import numpy as np
import sys

# In[12]:

env = FrozenLakeEnv(map_name="4x4", is_slippery=False)

# Access the state space:
nS = env.observation_space
print("State space of the Env: ", nS)

# or you could even use nS = env.nS
nS = env.nS
print("State space of the Env by accessing env.nS: ", nS)

# Action space of the agent:
nA = env.nA
print("Action space of the Env: ", nA)

# In[13]:

"""
For policy iteration, you need access to State (s), Action (a), Next State (ns),
Reward (r), and episode-ended (is_done) tuples.
Note that in this environment, the orientation of the agent does not matter.
No matter what direction the agent is facing, if a left action is performed,
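# A hedged illustration of the tuples mentioned above: in gym-style FrozenLake
# environments (including this customized one, which exposes env.P, env.nS and
# env.nA), env.P[s][a] is a list of (probability, next_state, reward, done)
# transitions, which is exactly what policy evaluation / iteration needs. A
# minimal sketch, with purely illustrative loop variable names:
for s in range(env.nS):
    for a in range(env.nA):
        for prob, next_state, reward, is_done in env.P[s][a]:
            # e.g. accumulate prob * (reward + gamma * V[next_state]) here
            pass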
def main(t_expert=1e-2,
         t_irl=1e-2,
         gamma=1,
         h=10,
         n_traj=200,
         traj_len=10,
         learning_rate=0.01,
         epochs=300):
    '''
    Demonstrates the usage of the implemented MaxCausalEnt IRL algorithm.

    First, a number of expert trajectories is generated using the true reward,
    giving rise to the Boltzmann rational expert policy with temperature
    t_expert. Then the max_causal_ent_irl() function is used to find a reward
    vector that maximizes the log likelihood of the generated expert
    trajectories, modelling the expert as a Boltzmann rational agent with
    temperature t_irl.

    Parameters
    ----------
    t_expert : float >= 0
        The temperature parameter for computing V, Q and the policy of the
        Boltzmann rational expert: p(a|s) is proportional to exp(Q/t_expert);
        the closer the temperature is to 0, the more rational the expert is.
    t_irl : float
        Temperature of the Boltzmann rational policy the IRL algorithm assumes
        the expert followed when generating the trajectories.
    gamma : float
        Discount factor; 0 <= gamma <= 1.
    h : int
        Horizon for the finite-horizon version of the value iteration
        subroutine of the MaxCausalEnt IRL algorithm.
    n_traj : int
        Number of expert trajectories generated.
    traj_len : int
        Number of timesteps in each of the expert trajectories.
    learning_rate : float
        Learning rate for gradient descent in the MaxCausalEnt IRL algorithm.
    epochs : int
        Number of gradient descent steps in the MaxCausalEnt IRL algorithm.
    '''
    np.random.seed(0)
    mdp = MDPOneTimeR(FrozenLakeEnv(is_slippery=False))

    # Features
    feature_matrix = np.eye(mdp.nS)
    # Add a dummy feature to show that features work
    if False:
        feature_matrix = np.concatenate((feature_matrix, np.ones((mdp.nS, 1))),
                                        axis=1)

    # The true reward weights and the reward
    theta_expert = np.zeros(feature_matrix.shape[1])
    theta_expert[24] = 1
    r_expert = np.dot(feature_matrix, theta_expert)

    # Compute the Boltzmann rational expert policy from the given true reward.
    if t_expert > 0:
        V, Q, policy_expert = vi_boltzmann(mdp, gamma, r_expert, h, t_expert)
    if t_expert == 0:
        V, Q, policy_expert = vi_rational(mdp, gamma, r_expert, h)

    # Generate expert trajectories using the given expert policy.
    trajectories = generate_trajectories(mdp, policy_expert, traj_len, n_traj)

    # Compute and print the stats of the generated expert trajectories.
    sa_visit_count, _ = compute_s_a_visitations(mdp, gamma, trajectories)
    log_likelihood = np.sum(sa_visit_count * (Q - V))
    print('Generated {} traj of length {}'.format(n_traj, traj_len))
    print('Log likelihood of all traj under the policy generated ',
          'from the true reward: {}, \n average per traj step: {}'.format(
              log_likelihood, log_likelihood / (n_traj * traj_len)))
    print('Average return per expert trajectory: {} \n'.format(
        np.sum(np.sum(sa_visit_count, axis=1) * r_expert) / n_traj))

    # Find a reward vector that maximizes the log likelihood of the generated
    # expert trajectories.
    theta = max_causal_ent_irl(mdp, feature_matrix, trajectories, gamma, h,
                               t_irl, epochs, learning_rate)
    print('Final reward weights: ', theta)
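# For reference, a minimal sketch of the Boltzmann rational policy described in
# the docstring above: p(a|s) is proportional to exp(Q(s,a) / t). This is not
# the vi_boltzmann() implementation used by main(); it is only an illustration
# that assumes Q is an (nS, nA) array and t > 0.
import numpy as np


def boltzmann_policy(Q, t):
    # Subtract the per-state max before exponentiating for numerical stability.
    scaled = (Q - Q.max(axis=1, keepdims=True)) / t
    exp_q = np.exp(scaled)
    # Normalize each row so the action probabilities for every state sum to 1.
    return exp_q / exp_q.sum(axis=1, keepdims=True)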
if __name__ == "__main__":
    args = init_args()
    skip_render = args.no_render
    map_name = args.map_name
    is_slippery = args.slippery
    gamma = args.gamma
    tol = args.tol

    # comment/uncomment these lines to switch between deterministic/stochastic environments
    # env = gym.make("Deterministic-4x4-FrozenLake-v0")
    # env = gym.make("Stochastic-4x4-FrozenLake-v0")
    # using local customized env
    env = FrozenLakeEnv(map_name=map_name, is_slippery=is_slippery)

    print("\n" + "-" * 25 + "\nBeginning Policy Iteration\n" + "-" * 25)
    V_pi, p_pi = policy_iteration(env.P, env.nS, env.nA, gamma=gamma, tol=tol)
    print('# policy evaluations:', len(policy_eval_iter_count), ' : ',
          policy_eval_iter_count)
    print('Optimal policy:')
    print_policy(env, p_pi, V_pi)
    if not skip_render:
        render_single(env, p_pi, 100)

    print("\n" + "-" * 25 + "\nBeginning Value Iteration\n" + "-" * 25)
    V_vi, p_vi = value_iteration(env.P, env.nS, env.nA, gamma=gamma, tol=tol)
    print('# value iteration:', n_value_iter)
            received_bits = received_bits + str(
                send_receive(int(bit), quantum_engine))
        received_bytes_list.append(received_bits)
    binary_to_string = ''.join([chr(int(x, 2)) for x in received_bytes_list])
    # print('Received Binary message: ', received_bytes_list)
    # print('Received message: ', binary_to_string)
    return binary_to_string


quantum_engine = MainEngine()
# message = 'DataEspresso'
# send_full_message(message=message, quantum_engine=quantum_engine)

# env = gym.make('FrozenLake-v0')
env = FrozenLakeEnv(is_slippery=False)

Q = np.zeros([env.observation_space.n, env.action_space.n])
lr = .8
y = .95
num_episodes = 2000
# jList = []
rList = []
for i in range(num_episodes):
    # Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    # The Q-Table learning algorithm
    while j < 99:
#!/usr/bin/env python
# coding: utf-8

# In[27]:

import numpy as np
from frozen_lake import FrozenLakeEnv
import random

env = FrozenLakeEnv()


def epsilon_greedy_action(env, Q, state, epsilon=0.3):
    n = random.uniform(0, 1)
    if n <= epsilon:
        return np.random.randint(env.action_space.n)
    else:
        return np.argmax(Q[state])


def Q_Learning(env, episodes=1000, gamma=0.91, alpha=0.1):
    Q = np.zeros([env.nS, env.nA])
    for i in range(episodes):
        finished = False
        env.reset()
        S = env.s
        while not finished:
def game(N_episodes, AI_type, Intrinsic_type):
    ############## Hyperparameters ##############
    env = FrozenLakeEnv()
    # memory = Memory(max_size=300)
    ppo = 0
    # n_episodes = number_of_episodes
    # n_actions = env.action_space.n
    # intrinsic = intrinsic
    # print(n_actions)
    # n_agents = 1
    # n_episodes = number_of_episodes
    # state_size = env.observation_space.n
    # env_name = "LunarLander-v2"
    # creating environment
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230        # stop training if avg_reward > solved_reward
    log_interval = 20          # print avg reward in the interval
    max_episodes = N_episodes  # max training episodes
    max_timesteps = 100        # max timesteps in one episode
    n_latent_var = 64          # number of variables in hidden layer
    update_timestep = 200      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99               # discount factor
    K_epochs = 4               # update policy for K epochs
    eps_clip = 0.2             # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma,
              K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_step = 5000

    # Pre-run
    next_obs = []
    for norm_step in range(norm_step):
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  # optional
        next_obs.append(state_norm)
    obs_rms.update(next_obs)
    # print(obs_rms.mean)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        for t in range(max_timesteps):
            # while not done:
            timestep += 1
            t += 1
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)
            # ========================================================
            if ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "1"):
                intrinsic_rewards = get_intrinsic_rewards(
                    AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards1", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "2"):
                intrinsic_rewards = get_intrinsic_rewards2(
                    AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards2", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "3"):
                intrinsic_rewards = get_intrinsic_rewards3(
                    AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                # print("intrinsic_rewards3", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "4"):
                intrinsic_rewards = get_intrinsic_rewards4(
                    AI_type, state, action, ppo, n_agents, reward * 10, t, 100, 0.99)
                # print("intrinsic_rewards---", intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C") and Intrinsic_type == "5"):
                intrinsic_rewards = get_intrinsic_rewards5(
                    AI_type, state, ppo, n_agents, 1, 16)
                # print("intrinsic_rewards5", intrinsic_rewards)
            else:
                intrinsic_rewards = 0
            # reward_sum = reward + intrinsic_rewards
            reward_sum = reward
            # ===========================================================
            # Saving reward and is_terminal:
            memory.rewards.append(reward_sum)
            # temp_int = memory.intrinsic_rewards.data.numpy()
            # temp_int = memory.intrinsic_rewards
            # print(temp_int)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards - reward_rms.mean) / np.sqrt(reward_rms.var)
            except:
                adv_int = 0
            """
            """
            print(temp_int.data.numpy())
            mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
            reward_rms.update_from_moments(mean1, std1 ** 2, count1)
            adv_int = (memory.intrinsic_rewards - reward_rms.mean) / np.sqrt(reward_rms.var)
            """

            # update if it's time
            if timestep % update_timestep == 0:
                temp_int = memory.intrinsic_rewards
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (temp_int) / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            # torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            # break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode, episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))

    # env.save_replay()
    env.close()
    return avg_rewards, best_avg_reward, samp_rewards, "0"
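# A hedged usage sketch for game() above: AI_type and Intrinsic_type are the
# string flags checked inside the training loop ("PPO"/"A2C" and "1".."5");
# the episode count here is only illustrative, not a value from the source.
# avg_rewards, best_avg_reward, samp_rewards, _ = game(2000, "PPO", "1")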
    plt.title("Epsilon-greedy with decay (epsilon=%.1f, decay=%.3f)" % (epsilon, decay))
    plt.xlabel('Episode')
    plt.legend(loc='best')
    file_name = '{}/{}/{}_epsilondecay.png'.format(FIGURES_DIRECTORY, ENV_NAME, 'ql')
    plt.savefig(file_name, format='png', dpi=150)
    plt.close()


if __name__ == '__main__':
    ENV_NAMES = [FL4x4, FL8x8, FL20x20]
    for ENV_NAME in ENV_NAMES:
        env = FrozenLakeEnv(
            map_name=ENV_NAME,
            rewards=(-0.01, -1, 1),  # living, hole, goal
            slip_rate=0.2
        )
        env = env.unwrapped

        # Tunables
        method = 'greedy'
        n_episodes = 10000
        gamma = 0.90
        alpha = 0.75
        epsilon = 1.0
        decay = 0.999
        Ne = 10

        start = time()
        q, stats, Nsa, policy = q_learning(
            env=env,
            method=method,
import argparse  # argparse and np are used below; make the dependencies explicit
import numpy as np
import numpy.matlib
from frozen_lake import FrozenLakeEnv

parser = argparse.ArgumentParser(description='')
parser.add_argument('--N', type=int, default=4, help='')
parser.add_argument('--alpha', type=float, default=0.05, help='')
parser.add_argument('--beta', type=float, default=0.05, help='')
parser.add_argument('--kappa', type=float, default=0.01, help='')
parser.add_argument('--episodes', type=int, default=int(1e7), help='')
parser.add_argument('--runtimes', type=int, default=16, help='')
parser.add_argument('--off_policy', type=int, default=0, help='')
args = parser.parse_args()

unit = 1.0
# experiment preparation
env = FrozenLakeEnv(None, '%dx%d' % (args.N, args.N), True, unit)
runtimes, episodes, gamma = args.runtimes, args.episodes, lambda x: 0.95
target_policy = np.matlib.repmat(
    np.array([0.2, 0.3, 0.3, 0.2]).reshape(1, 4),
    env.observation_space.n, 1)
if args.off_policy == 0:
    behavior_policy = target_policy
else:
    behavior_policy = np.matlib.repmat(
        np.array([0.25, 0.25, 0.25, 0.25]).reshape(1, 4),
        env.observation_space.n, 1)
alpha, beta, kappa = args.alpha, args.beta, args.kappa

# get ground-truth expectation, variance and stationary distribution
filename = 'frozenlake_truths_%dx%d.npz' % (args.N, args.N)
loaded = np.load(filename)
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 14:59:23 2017

@author: wsn
"""
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv()
print(env.__doc__)

# Some basic imports and setup
import numpy as np, numpy.random as nr, gym
np.set_printoptions(precision=3)


def begin_grading():
    print("\x1b[43m")


def end_grading():
    print("\x1b[0m")


# Seed RNGs so you get the same printouts as me
env.seed(0); from gym.spaces import prng; prng.seed(10)

# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render();


class MDP(object):
    def __init__(self, P, nS, nA, desc=None):
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 16:47:07 2017

@author: wsn
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 14:59:23 2017

@author: wsn
"""
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv()
print(env.__doc__)

# Some basic imports and setup
import numpy as np, numpy.random as nr, gym
np.set_printoptions(precision=3)


def begin_grading():
    print("\x1b[43m")


def end_grading():
    print("\x1b[0m")


# Seed RNGs so you get the same printouts as me
#!/usr/bin/env python
# coding: utf-8

# In[5]:

import numpy as np
from frozen_lake import FrozenLakeEnv

environment = FrozenLakeEnv()
epochs = 1000
if_break = True


def Func(alpha, gamma):
    V = np.zeros(16)
    for epoch in range(epochs):
        state = 0  # initial state of each episode
        if_break = True
        while if_break:
            random_action = np.random.randint(4)
            tupl = environment.P[state][random_action]
            next_state = tupl[0][1]
            if next_state == 15:
                R = 1
            else:
                R = 0
            # Tabular TD(0) update: V(s) <- V(s) + alpha * (R + gamma * V(s') - V(s))
            V[state] = (V[state] + alpha * (R + gamma * V[next_state] - V[state]))