def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.0001
    basis_order = 3

    # Probe the environment once to size the Fourier-basis feature vector.
    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    agents = []
    for setup in setups:
        gym_env = gym.make('CartPole-v0')
        gym_env.env.force_mag = setup["force"]
        gym_env.env.length = setup["pole_length"]
        gym_env.env.masscart = setup["masscart"]
        gym_env.env.masspole = setup["masspole"]
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
        agents.append(agent)

    # Train a policy per setup, then evaluate a frozen copy in pure-exploit
    # (random_action_prob = 0) and pure-explore (random_action_prob = 1) modes.
    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                              update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))
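# Illustrative call (hypothetical values: the meta-policy pickle path, output directory,
# and CartPole setup numbers below are assumptions, not values taken from the experiments):
# setups = [{"force": 10.0, "pole_length": 0.5, "masscart": 1.0, "masspole": 0.1}]
# experiment_explore_vs_exploit("./meta.pkl", "./ExploreExploitCartpole", setups,
#                               episodes=200, steps=500)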
phi = fourier_basis(obs, order=basis_order)
num_features = phi.shape[0]  # + len( cartpole_setup[0].keys() )
num_actions = env.action_space.n
# meta = MetaPolicy(num_features=num_features, num_actions=num_actions)

mazes = ["maze5.txt", "maze6.txt", "maze7.txt"]
for m in mazes:
    gym_env = AnimatEnv("./CustomEnvironments/" + m)
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
    agent.learning_algorithm.t_length = 8
    agent.learning_algorithm.update_steps = 64
    agent.learning_algorithm.epochs = 4
    agent.learning_algorithm.batch_size = 16

    dir = "./AnimatPPOEvalNoMeta/" + m.split(".")[0] + "/"
    # agent.random_action_prob = 0.0
    rewards = agent.train(num_episodes=500, max_steps=800, verbose=True,
                          update_meta=False, render=False, save_path=dir)
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, xml_models, episodes, steps):
    alpha = 1e-3
    beta = 1e-2

    env = gym.make("RoboschoolInvertedPendulum-v1")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    num_features = obs.shape[0]
    num_actions = env.action_space.low.shape[0]

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    for k in range(1):
        if k == 0:
            meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None

        agents = []
        for model in xml_models:
            domain_agents = []
            for _ in range(num_samples):
                env = gym.make("RoboschoolInvertedPendulum-v1")
                env.env.model_xml = model
                agent = LinearAgent(env, meta_policy=meta, algo="PPO", alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 16
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 8
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            null_actions = {}
            trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                domain_agents=agents, num_steps=steps, optimize_meta=False)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_returns = []
                for j in range(len(trajectories_by_domain[i])):
                    sample_returns.append(sum(trajectories_by_domain[i][j]))
                domain_samples.append(sample_returns)

            print("Episode %d / %d" % (ep, episodes))
            domain_rewards_by_episode[ep] = domain_samples

            if ep % 100 == 0:
                filename = "meta_test_" + str(ep) + ".pkl" if k == 0 else "no_meta_test_" + str(ep) + ".pkl"
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/" + filename, "wb"))
                for ai, a in enumerate(agents):
                    type_ = "meta_" if k == 0 else "no_meta_"
                    type_ += str(ai) + "_"
                    a[0].learning_algorithm.save_model(save_directory + "/" + type_, ep)
def experiment_train_meta(save_directory, meta_alpha, meta_beta, xml_models):
    alpha = 1e-3
    beta = 1e-2

    env = gym.make("RoboschoolInvertedPendulum-v1")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    num_features = obs.shape[0]
    num_actions = env.action_space.low.shape[0]

    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)
    meta.learning_algorithm.t_length = 32
    meta.learning_algorithm.update_steps = 64

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 30
    num_samples = 4
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        trial_by_domain = {}
        for i, model in enumerate(xml_models):
            domain_agents = []
            for _ in range(num_samples):
                env = gym.make("RoboschoolInvertedPendulum-v1")
                env.env.model_xml = model
                agent = LinearAgent(env, meta_policy=meta, algo="PPO", alpha=alpha, beta=beta)
                domain_agents.append(agent)
                agent.learning_algorithm.t_length = 16
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 8
            agents.append(domain_agents)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        episodes = 250
        steps = 500
        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=None)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_returns = []
                for j in range(len(trajectories_by_domain[i])):
                    sample_returns.append(sum(trajectories_by_domain[i][j]))
                domain_samples.append(sample_returns)

            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

            if ep % 100 == 0:
                val = (k * episodes) + ep
                meta.learning_algorithm.save_model(save_directory + "/", val)
                # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/trajectory_iter_" + str(val) + ".pkl", "wb"))
import time

from PIL import Image

if __name__ == "__main__":
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-e", "--environment", action="store", help="environment name",
                      type="string", default="animat")
    parser.add_option("-m", "--model", action="store", help="env model", type="string")
    parser.add_option("-a", "--actor", action="store", help="agent actor", type="string")
    parser.add_option("-c", "--critic", action="store", help="agent critic", type="string")
    parser.add_option("-s", "--save_path", action="store", help="path for saving vid",
                      type="string", default=None)
    (options, args) = parser.parse_args()

    env = options.environment
    if env != "animat":
        env = gym.make(env)
        env.env.model_xml = options.model
    else:
        gym_env = AnimatEnv(options.model)
        env = EnvWrapper(gym_env, basis_order=3, normalization=1)

    agent = LinearAgent(env, meta_policy=None, algo="PPO")
    agent.random_action_prob = 0.0
    agent.learning_algorithm.load_model(options.actor, options.critic)
    agent.play(max_steps=10000, delay=0.01, save_path=options.save_path)
    time.sleep(0.5)
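# Example invocation (a sketch: the script name and file paths are hypothetical; the
# option flags are the ones defined by the OptionParser above):
#   python play_agent.py -e animat -m ./CustomEnvironments/maze1.txt \
#       -a <actor_checkpoint> -c <critic_checkpoint> -s ./videos/maze1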
def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    meta = pickle.load(open(meta_path, "rb"))

    agents = []
    for setup in setups:
        gym_env = AnimatEnv(setup)
        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
        agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="REINFORCE")
        agents.append(agent)

    policies = []
    for agent in agents:
        rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                              update_meta=False, render=False)
        policies.append(copy.deepcopy(agent.learning_algorithm))

    rewards = []
    for i, agent in enumerate(agents):
        agent.learning_algorithm = policies[i]
        agent.random_action_prob = 0.0
        agent.RANDOM_ACTION_DECAY = 1.0
        exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        agent.random_action_prob = 1.0
        explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                                      update_meta=False, render=False)
        rewards.append({"explore": explore_rewards, "exploit": exploit_rewards})

    pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl", "wb"))
def experiment_with_without_actions(meta_path, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    for k in range(2):
        meta = pickle.load(open(meta_path, "rb"))

        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                if k == 0:
                    prevent_actions = gym_env.action_space.useless_actions
                else:
                    prevent_actions = None
                agent = LinearAgent(env, meta_policy=meta, algo="REINFORCE",
                                    prevent_actions=prevent_actions)
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        action, explore = t['action']
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)

            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples

        filename = "without_actions.pkl" if k == 0 else "with_actions.pkl"
        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/" + filename, "wb"))
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):
    alpha = 1e-4
    beta = 1e-3
    basis_order = 3

    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n
    env = EnvWrapper(env, basis_order=basis_order, normalization=1)

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 3
    for k in range(2):
        k = 1  # NOTE: overrides the loop variable, so only the no-meta branch below runs
        if k == 0:
            meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None

        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                agent = LinearAgent(env, meta_policy=meta, algo="PPO", alpha=alpha, beta=beta)
                agent.learning_algorithm.t_length = 8
                agent.learning_algorithm.update_steps = 64
                agent.learning_algorithm.epochs = 4
                agent.learning_algorithm.batch_size = 16
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        null_action_by_episode = {}
        for ep in range(episodes):
            null_actions = {}
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        action, explore = t['action']
                        # Count actions whose physical effect on the animat is (near) zero.
                        a = agents[i][j].env.env.action_space.actions[action]
                        effect = agents[i][j].env.env.animat._action_effect(a)
                        if math.fabs(effect[0]) < 0.1 and math.fabs(effect[1]) < 0.1:
                            if action in null_actions:
                                null_actions[action] += 1
                            else:
                                null_actions[action] = 1
                        t_rewards.append(t['reward'])
                    sample_rewards.append(sum(t_rewards))
                domain_samples.append(sample_rewards)

            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples
            null_action_by_episode[ep] = null_actions

            if ep % 10 == 0:
                filename = "meta_test_" + str(ep) + ".pkl" if k == 0 else "no_meta_test_" + str(ep) + ".pkl"
                filename2 = "null_actions_meta_" + str(ep) + ".pkl" if k == 0 else "null_actions_no_meta_" + str(ep) + ".pkl"
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/" + filename, "wb"))
                pickle.dump(null_action_by_episode,
                            open(save_directory + "/" + filename2, "wb"))
                for ai, a in enumerate(agents):
                    type_ = "meta_" if k == 0 else "no_meta_"
                    type_ += str(ai) + "_"
                    a[0].learning_algorithm.save_model(save_directory + "/" + type_, ep)
def experiment_random_baseline(save_directory):
    env = AnimatEnv("./CustomEnvironments/maze1.txt")
    env.reset()
    basis_order = ExperimentsAnimat.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.normalize_range(obs, env.env_range)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 100
    num_samples = 1
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            # episodes = d['episodes']
            # steps = d['max_steps']
            episodes = 600
            steps = 600
            basis_order = d['order']

            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                agent = LinearAgent(env, meta_policy=None, algo="REINFORCE")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)

            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))
def experiment_train_meta(save_directory, meta_alpha, meta_beta):
    alpha = 1e-4
    beta = 1e-3

    gym_env = AnimatEnv("./CustomEnvironments/maze1.txt")
    gym_env.reset()
    basis_order = 0  # ExperimentsAnimat.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = gym_env.step(gym_env.action_space.sample())
    # obs = EnvWrapper.normalize_range(obs, gym_env.env_range)
    # phi = fourier_basis(obs, order=basis_order)
    num_features = obs.shape[0]
    num_actions = gym_env.action_space.n
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)

    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)
    meta.learning_algorithm.t_length = 32
    meta.learning_algorithm.update_steps = 256

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 30
    num_samples = 3
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            episodes = d['episodes']
            # steps = 500 #d['max_steps']
            # episodes = 1000
            steps = 600
            basis_order = d['order']

            domain_agents = []
            for _ in range(num_samples):
                gym_env = AnimatEnv(setup)
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
                agent = LinearAgent(env, meta_policy=meta, algo="PPO", alpha=alpha, beta=beta)
                domain_agents.append(agent)
                agent.learning_algorithm.t_length = 32
                agent.learning_algorithm.update_steps = 128
                agent.learning_algorithm.epochs = 4
                agent.learning_algorithm.batch_size = 16
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsAnimat._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=r_maxs)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(sum(t_rewards))
                domain_samples.append(sample_rewards)

            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

            if ep % 10 == 0:
                val = (k * episodes) + ep
                meta.learning_algorithm.save_model(save_directory + "/", val)
                # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/trajectory_iter_" + str(val) + ".pkl", "wb"))

        trajectories = []
        for key in trial_by_domain.keys():
            for traj in trial_by_domain[key]:
                trajectories.append(traj)

        if meta.algo == "REINFORCE":
            print("Updating meta....")
            meta.montecarlo_update(trajectories)
def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):
    alpha = 0.001
    basis_order = 3

    env = gym.make('CartPole-v0')
    env.reset()
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    num_samples = 5
    for k in range(2):
        if k == 0:
            meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                              algo="PPO", alpha=1e-3, beta=1e-3, env=env)
            meta.learning_algorithm.load_model(meta_actor, meta_critic)
        else:
            meta = None

        agents = []
        for setup in setups:
            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")
                if meta is None:
                    agent.random_action_prob = 0.0
                domain_agents.append(agent)
            agents.append(domain_agents)

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, optimize_meta=False)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)

            print("Episode %d" % (ep))
            domain_rewards_by_episode[ep] = domain_samples

            if ep % 100 == 0:
                filename = "meta_test_" + str(ep) + ".pkl" if k == 0 else "no_meta_test_" + str(ep) + ".pkl"
                pickle.dump(domain_rewards_by_episode,
                            open(save_directory + "/" + filename, "wb"))
def experiment_random_baseline(save_directory):
    env = gym.make('CartPole-v0')
    env.reset()
    basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = env.step(env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = env.action_space.n

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 100
    num_samples = 5
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            episodes = d['episodes']
            steps = d['max_steps']
            basis_order = d['order']
            # alpha = d['alpha']
            alpha = 0.0001

            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo="SARSA")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)

            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))
def experiment_train_meta(save_directory, meta_alpha, meta_beta):
    gym_env = gym.make('CartPole-v0')
    gym_env.reset()
    basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']
    (obs, reward, done, info) = gym_env.step(gym_env.env.action_space.sample())
    obs = EnvWrapper.modified_sigmoid(obs)
    phi = fourier_basis(obs, order=basis_order)
    num_features = phi.shape[0]
    num_actions = gym_env.env.action_space.n
    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

    meta = MetaPolicy(num_features=num_features, num_actions=num_actions,
                      algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)

    if not os.path.isdir(save_directory):
        os.mkdir(save_directory)

    meta_train_episodes = 500
    num_samples = 3
    for k in range(meta_train_episodes):
        agents = []
        print("Loading environments...")
        r_maxs = []
        trial_by_domain = {}
        for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
            print("Setup: " + str(d['setup']))
            setup = d['setup']
            max_r = d['max_r']
            episodes = d['episodes']
            steps = d['max_steps']
            basis_order = d['order']
            # alpha = d['alpha']
            alpha = 0.0001

            domain_agents = []
            for _ in range(num_samples):
                gym_env = gym.make('CartPole-v0')
                gym_env.env.force_mag = setup["force"]
                gym_env.env.length = setup["pole_length"]
                gym_env.env.masscart = setup["masscart"]
                gym_env.env.masspole = setup["masspole"]
                env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")
                domain_agents.append(agent)
            agents.append(domain_agents)
            r_maxs.append(max_r)
            trial_by_domain[i] = [list() for _ in range(num_samples)]
        print("Done loading...")

        domain_rewards_by_episode = {}
        for ep in range(episodes):
            trajectories_by_domain = ExperimentsCartpole._run_episode(
                domain_agents=agents, num_steps=steps, r_maxs=r_maxs)

            domain_samples = []
            for i in trajectories_by_domain.keys():
                sample_rewards = []
                for j in range(len(trajectories_by_domain[i])):
                    t_rewards = []
                    for t in trajectories_by_domain[i][j]:
                        t_rewards.append(t['reward'])
                        t['reward'] = t['reward'] / r_maxs[i]
                        trial_by_domain[i][j].append(t)
                    sample_rewards.append(t_rewards)
                domain_samples.append(sample_rewards)

            print("Episode %d - Trial %d" % (ep, k))
            domain_rewards_by_episode[ep] = domain_samples

        trajectories = []
        for key in trial_by_domain.keys():
            for traj in trial_by_domain[key]:
                trajectories.append(traj)

        if meta.algo == "REINFORCE":
            print("Updating meta....")
            meta.montecarlo_update(trajectories)

        meta.learning_algorithm.save_model(save_directory + "/", k)
        # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
        pickle.dump(domain_rewards_by_episode,
                    open(save_directory + "/trajectory_iter_" + str(k) + ".pkl", "wb"))
import numpy as np
import gym
import matplotlib.pyplot as plt

basis_order = 1
alpha = 1e-2
beta = 1e-2

setup = {"force": 20.0, "pole_length": 1.2, "masscart": 5.0, "masspole": 0.1}

gym_env = gym.make('CartPole-v0')
gym_env.env.force_mag = setup["force"]
gym_env.env.length = setup["pole_length"]
gym_env.env.masscart = setup["masscart"]
gym_env.env.masspole = setup["masspole"]

env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 16
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 8

rewards = agent.train(num_episodes=500, max_steps=1000, verbose=True,
                      update_meta=False, render=False)

# Average episode returns over windows of 10 episodes before plotting.
rewards = [np.mean(rewards[i * 10:(i + 1) * 10]) for i in range(len(rewards) // 10)]
plt.plot(range(len(rewards)), rewards)
plt.show(block=True)

# for _ in range(10000):
#     reward, done, update_info = agent.perform_step(update_meta=False)
#     env.render()
#     time.sleep(1.0)
args = namedtuple("parser", d_args.keys())(*d_args.values())
args.out

# create the output directory and the tensorboard writer here
os.makedirs(args.out, exist_ok=True)

# create the tensorboard summary writer here
tb_log_dir = os.path.join(args.log_dir, "2DGrid", name, 'tb_logs')
# Susan added this line
csv_log_dir = os.path.join(args.log_dir, "2DGrid", name, 'csv_logs')
print("Log dir", tb_log_dir)
print("Out dir", args.out)
# Susan added this line
print("csv log dir", csv_log_dir)

if args.reset_dir:
    shutil.rmtree(tb_log_dir, ignore_errors=True)
    # Susan added this line
    shutil.rmtree(csv_log_dir, ignore_errors=True)
os.makedirs(tb_log_dir, exist_ok=True)
# Susan added this line
os.makedirs(csv_log_dir, exist_ok=True)
tb_writer = SummaryWriter(log_dir=tb_log_dir)

agent = LinearAgent(args, env, action_noise, featurize_state, featurize_action,
                    tb_log_dir, csv_log_dir)
# run the agent here
agent.run()
    }]
else:
    print("Unrecognized environment: " + env_name)
    assert False

if not os.path.isdir(save_dir):
    os.mkdir(save_dir)

data = []
for setup in setups:
    if env_name.lower() == "animat":
        env = init_env_animat(setup)
    elif env_name.lower() == "cartpole":
        env = init_env_cartpole(setup)

    agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo=algo)
    rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True,
                          update_meta=False, render=False)

    setup_data = {
        'setup': setup,
        'max_r': max(rewards),
        'episodes': episodes,
        'max_steps': steps,
        'alpha': alpha,
        'order': basis_order,
        'algo': algo