def reset(self):
    obs = self.env.reset()
    if self.normalization == 0:
        obs = self.modified_sigmoid(obs)
    else:
        obs = self.normalize_range(obs, self.env.env_range)
    obs = fourier_basis(obs, order=self.order)
    return obs

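# The fourier_basis helper used above is not shown in this section. The sketch
# below is a minimal, illustrative order-n Fourier basis over an observation
# normalized to [0, 1]; the name and signature are assumed from the call sites
# here and may differ from the project's actual implementation.
import itertools
import numpy as np

def fourier_basis(obs, order=3):
    # One cosine feature per coefficient vector c in {0, ..., order}^len(obs):
    # phi_c(obs) = cos(pi * c . obs), giving (order + 1) ** len(obs) features.
    obs = np.asarray(obs, dtype=np.float64)
    coeffs = np.array(list(itertools.product(range(order + 1), repeat=obs.shape[0])))
    return np.cos(np.pi * coeffs.dot(obs))
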
def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.0001
    basis_order = 3

    # Probe a default CartPole instance once to size the Fourier feature vector.
    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    # One SARSA agent per CartPole configuration.
    agents = []
    for setup in setups:
        gym_env = gym.make('CartPole-v0')
        gym_env.env.force_mag = setup["force"]
        gym_env.env.length = setup["pole_length"]
        gym_env.env.masscart = setup["masscart"]
        gym_env.env.masspole = setup["masspole"]
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
        agents.append(agent)

    # Train each agent, then keep a frozen copy of its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps,
                              verbose=True, update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each frozen policy twice: pure exploitation, then pure exploration.
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))

def step(self, action=None):
    if action is None:
        obs, reward, done, _ = self.env.step(self.env.action_space.sample())
    else:
        obs, reward, done, _ = self.env.step(action)
    if self.normalization == 0:
        obs = self.modified_sigmoid(obs)
    else:
        obs = self.normalize_range(obs, self.env.env_range)
    obs = fourier_basis(obs, order=self.order)
    return obs, reward, done, ""

def init_env_animat(setup):
    env = AnimatEnv(setup)
    env.reset()
    # Probe the environment once to size the Fourier feature vector.
    # NOTE: relies on the module-level basis_order defined elsewhere in this file.
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n
    # Return a fresh, wrapped instance of the environment.
    gym_env = AnimatEnv(setup)
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    return env

def __init__(self, env, basis_order=2, normalization=0):
    self.env = env
    self.env.reset()
    self.normalization = normalization
    self.order = basis_order
    # Probe the environment once to determine the size of the feature vector.
    (obs, reward, done, info) = self.env.step(self.env.action_space.sample())
    if normalization == 0:
        obs = EnvWrapper.modified_sigmoid(obs)
    else:
        obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=self.order)
    self.num_features = phi.shape[0]
    self.num_actions = self.env.action_space.n
    self.action_space = self.env.action_space

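# EnvWrapper.modified_sigmoid and EnvWrapper.normalize_range are referenced
# above but not defined in this section. The functions below are hedged
# sketches of what such normalizers typically look like (squash to (0, 1) with
# a sigmoid, or rescale each dimension by a known range); they assume env_range
# is a list of (low, high) pairs per dimension, and the real implementations
# may differ.
def modified_sigmoid(obs):
    # Squash each raw observation dimension into (0, 1).
    return 1.0 / (1.0 + np.exp(-np.asarray(obs, dtype=np.float64)))

def normalize_range(obs, env_range):
    # Rescale each dimension to [0, 1] given per-dimension (low, high) bounds.
    low = np.array([r[0] for r in env_range], dtype=np.float64)
    high = np.array([r[1] for r in env_range], dtype=np.float64)
    return (np.asarray(obs, dtype=np.float64) - low) / (high - low)
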
from linear_agent import LinearAgent
import time
import matplotlib.pyplot as plt
import numpy as np
import pickle

basis_order = 3
alpha = 1e-5
beta = 1e-4

env = AnimatEnv("./CustomEnvironments/maze7.txt")
env.reset()
(obs, reward, done, info) = env.step(env.action_space.sample())
obs = EnvWrapper.normalize_range(obs, env.env_range)
phi = fourier_basis(obs, order=basis_order)
num_features = phi.shape[0]  # + len(cartpole_setup[0].keys())
num_actions = env.action_space.n
# meta = MetaPolicy(num_features=num_features, num_actions=num_actions)

mazes = ["maze5.txt", "maze6.txt", "maze7.txt"]
for m in mazes:
    gym_env = AnimatEnv("./CustomEnvironments/" + m)
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    agent = LinearAgent(env, meta_policy=None, alpha=alpha,

def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    # Probe one maze to size the Fourier feature vector.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    # One REINFORCE agent per maze configuration.
    agents = []
    for setup in setups:
        gym_env = AnimatEnv(setup)
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="REINFORCE")
        agents.append(agent)

    # Train each agent, then keep a frozen copy of its learned policy.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps,
                              verbose=True, update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    # Re-run each frozen policy twice: pure exploitation, then pure exploration.
    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps,
                                      verbose=True, update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))

def experiment_with_without_actions(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    # Probe one maze to size the Fourier feature vector.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    # k == 0: useless actions are masked out; k == 1: full action set.
    for k in range(2):
        meta = pickle.load(open(meta_path, "rb"))
        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                if k == 0:
                    prevent_actions = gym_env.action_space.useless_actions
                else:
                    prevent_actions = None
                agent = LinearAgent(env, meta_policy=meta, algo="REINFORCE",
                                    prevent_actions=prevent_actions)
                domain_agents.append(agent)
            agents.append(domain_agents)

        # Collect per-step rewards for every domain and sample, each episode.
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        action, explore = t['action']
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples

        filename = "without_actions.pkl" if k == 0 else "with_actions.pkl"
        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/" + filename, "wb"))

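# The ExperimentsAnimat._run_episode helper called above is not shown in this
# section. From how its return value is consumed, it is assumed to map each
# domain index to one trajectory per sampled agent, where a trajectory is a
# list of per-step dicts containing at least 'reward' and 'action' (an
# (action, explored) pair). The stand-in below only illustrates that assumed
# shape; it is not the project's implementation.
def _run_episode_stub(domain_agents, num_steps):
    trajectories_by_domain = {}
    for i, domain in enumerate(domain_agents):
        trajectories_by_domain[i] = []
        for agent in domain:
            trajectory = []
            agent.env.reset()
            for _ in range(num_steps):
                action = agent.env.action_space.sample()  # placeholder policy
                obs, reward, done, _ = agent.env.step(action)
                trajectory.append({'action': (action, False), 'reward': reward})
                if done:
                    break
            trajectories_by_domain[i].append(trajectory)
    return trajectories_by_domain
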
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups,
                              episodes, steps):
    alpha = 1e-4
    beta = 1e-3
    basis_order = 3

    # Probe one maze to size the Fourier feature vector.
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n
    env = EnvWrapper(env, basis_order=basis_order, normalization=1)

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 3
    for k in range(2):
        # NOTE: this overrides the loop variable, so only the no-meta branch
        # below is ever exercised.
        k = 1
        if k == 0:
            meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None

        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                agent = LinearAgent(env, meta_policy=meta, algo="PPO",
                                    alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 8
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 4
                agent.learning_algorithm.batch_size = 16
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        null_action_by_episode = {}
        for ep in range(episodes):
            null_actions = {}
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        action, explore = t['action']
                        # Count actions whose effect on the animat is (near) zero.
                        a = agents[i][j].env.env.action_space.actions[action]
                        effect = agents[i][j].env.env.animat._action_effect(a)
                        if math.fabs(effect[0]) < 0.1 and math.fabs(effect[1]) < 0.1:
                            if action in null_actions:
                                null_actions[action] += 1
                            else:
                                null_actions[action] = 1
                        t_rewards.append(t['reward'])
                    sample_rewards.append(sum(t_rewards))
                domain_samples.append(sample_rewards)
            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples
            null_action_by_episode[ep] = null_actions

            # Periodically checkpoint rewards, null-action counts, and models.
            if ep % 10 == 0:
                filename = ("meta_test_" + str(ep) + ".pkl" if k == 0
                            else "no_meta_test_" + str(ep) + ".pkl")
                filename2 = ("null_actions_meta_" + str(ep) + ".pkl" if k == 0
                             else "null_actions_no_meta_" + str(ep) + ".pkl")
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/" + filename, "wb"))
                pickle.dump(null_action_by_episode,
                            open(save_directory + "/" + filename2, "wb"))
                for ai, a in enumerate(agents):
                    type_ = "meta_" if k == 0 else "no_meta_"
                    type_ += str(ai) + "_"
                    a[0].learning_algorithm.save_model(save_directory + "/" + type_, ep)

def experiment_random_baseline(save_directory):
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()

    # Probe the maze to size the Fourier feature vector.
    basis_order = ExperimentsAnimat.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 100
    num_samples = 1
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            # episodes = d['episodes']
            # steps = d['max_steps']
            episodes = 600
            steps = 600
            basis_order = d['order']
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                agent = LinearAgent(env, meta_policy=None, algo="REINFORCE")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                        # Normalize rewards by the recorded per-domain maximum.
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))

def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups,
                              episodes, steps):
    alpha = 0.001
    basis_order = 3

    # Probe a default CartPole instance once to size the Fourier feature vector.
    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    # k == 0: exploration driven by the trained meta-policy; k == 1: no meta-policy.
    for k in range(2):
        if k == 0:
            meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None

        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")
                if meta is None:
                    agent.random_action_prob = 0.0
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, optimize_meta=False)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples

            if ep % 100 == 0:
                filename = ("meta_test" + str(ep) + ".pkl" if k == 0
                            else "no_meta_test_" + str(ep) + ".pkl")
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/" + filename, "wb"))

def experiment_random_baseline(save_directory):
    env = gym.make('CartPole-v0')
    env.reset()

    # Probe a default CartPole instance once to size the Fourier feature vector.
    basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 100
    num_samples = 5
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            episodes = d['episodes']
            steps = d['max_steps']
            basis_order = d['order']
            # alpha = d['alpha']
            alpha = 0.0001
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo="SARSA")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))

def experiment_train_meta(save_directory, meta_alpha, meta_beta):
    gym_env = gym.make('CartPole-v0')
    gym_env.reset()

    # Probe a default CartPole instance once to size the Fourier feature vector.
    basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = gym_env.step(gym_env.env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = gym_env.env.action_space.n

    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 500
    num_samples = 3
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            episodes = d['episodes']
            steps = d['max_steps']
            basis_order = d['order']
            # alpha = d['alpha']
            alpha = 0.0001
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                        # Normalize rewards by the recorded per-domain maximum.
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)
            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        # Collect all normalized trajectories and use them to update the meta-policy.
        trajectories = []
        for key in trial_by_domain.keys():
            for traj in trial_by_domain[key]:
                trajectories.append(traj)
        if meta.algo == "REINFORCE":
            print("Updating meta....")
            meta.montecarlo_update(trajectories)
        meta.learning_algorithm.save_model(save_directory + "/", k)
        # pickle.dump(meta, open(save_directory + "/meta_iter_" + str(k) + ".pkl", "wb"))
        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))

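# Hypothetical invocation of the CartPole meta-training and evaluation
# experiments defined above. The setup dictionaries, output directories, and
# actor/critic checkpoint paths are illustrative only, and the functions are
# assumed to be callable at module scope (they may in fact live on
# ExperimentsCartpole).
if __name__ == "__main__":
    setups = [
        {"force": 10.0, "pole_length": 0.5, "masscart": 1.0, "masspole": 0.1},
        {"force": 5.0, "pole_length": 1.0, "masscart": 1.5, "masspole": 0.2},
    ]
    experiment_train_meta("./results/meta", meta_alpha=1e-3, meta_beta=1e-3)
    experiment_meta_vs_random("./results/meta/actor.pkl",      # hypothetical checkpoint
                              "./results/meta/critic.pkl",     # hypothetical checkpoint
                              "./results/meta_vs_random",
                              setups, episodes=500, steps=200)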