def main():
    cmd_parser = cmd_parse()
    options = cmd_parser.parse_args()

    ## Get the Stock Ticker data ##
    # print("The Stock ticker used here is ", options.ticker)
    file = Path("./data/" + options.ticker + ".csv")
    if file.is_file():
        df = pd.read_csv('./data/' + options.ticker + '.csv')
        df = df.sort_values('Date')
        print("Loading ticker data from: " + "./data/" + options.ticker + ".csv")
    else:
        print("Data file for ticker does not exist. Please download data first to ./data/"
              + options.ticker + ".csv")
        return  # without the data file there is nothing to train on

    training_logs_path = options.output_file + "_training_logs.csv"
    eval_logs_path = options.output_file + "_eval_logs"

    ## Get the training set size ##
    print("The options.training_set_size is ", options.training_set_size)
    ## Get the number of look back days ##
    print("The options.look_back_days here is: ", options.look_back_days)
    ## Get the model we are using to train the agent ##
    print("The model to train the agent here is: ", options.model)

    # The algorithms require a vectorized environment to run
    env = DummyVecEnv([
        lambda: StockTradingEnv(df, options.look_back_days,
                                options.training_set_size, eval_logs_path)
    ])

    if options.model == "PPO2":
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=options.training_set_size)
        np.savetxt(training_logs_path, model.training_rewards, delimiter=",")

        # Evaluate on the remaining (out-of-sample) days
        obs = env.reset()
        for i in range(options.training_set_size, len(df['Date'])):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            env.render(title=options.ticker)
        env.close()
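# main() above relies on a cmd_parse() helper that is not shown in this excerpt.
# A minimal sketch of what it could look like, assuming argparse and the option
# names used above (ticker, output_file, training_set_size, look_back_days, model);
# the default values are illustrative only.
import argparse

def cmd_parse():
    parser = argparse.ArgumentParser(description="Train an RL agent on stock ticker data.")
    parser.add_argument("--ticker", type=str, default="AAPL",
                        help="Ticker symbol; data is read from ./data/<ticker>.csv")
    parser.add_argument("--output-file", type=str, default="logs",
                        help="Prefix for the training and evaluation log files")
    parser.add_argument("--training-set-size", type=int, default=1000,
                        help="Number of rows used for training (also the timestep budget)")
    parser.add_argument("--look-back-days", type=int, default=30,
                        help="Length of the observation window in days")
    parser.add_argument("--model", type=str, default="PPO2",
                        help="Algorithm used to train the agent")
    return parser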
def __init__(self, env_index: int = 0, max_time_step_per_robot=1000,
             model_path="./cnn", feature_extractor="cnn"):
    self.env = make_vec_env(DoomEnv, n_envs=1, vec_env_cls=DummyVecEnv,
                            env_kwargs={
                                "display": False,
                                "feature": feature_extractor,
                                "env_index": env_index
                            })
    policy_kwargs = dict(feature_extraction=feature_extractor)
    self.model = PPO2(PPOPolicy, self.env, policy_kwargs=policy_kwargs, gamma=0.99,
                      n_steps=max_time_step_per_robot, ent_coef=0.01, learning_rate=2.5e-4,
                      vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=4,
                      noptepochs=100, cliprange=0.2, cliprange_vf=None, verbose=1,
                      tensorboard_log="./tensorboard", _init_setup_model=True,
                      full_tensorboard_log=True, seed=None, n_cpu_tf_sess=None)
    self.model_path = model_path
    if path.exists(self.model_path):
        # PPO2.load is a classmethod that returns a new model instance, so reassign it;
        # calling load() on the existing model would discard the loaded weights.
        self.model = PPO2.load(self.model_path, env=self.env)
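# A minimal sketch of how an agent class built around the constructor above might
# train and persist its model; the method name and timestep budget are assumptions,
# while learn() and save() are the standard PPO2 calls.
def train(self, total_timesteps=100000):
    self.model.learn(total_timesteps=total_timesteps)
    self.model.save(self.model_path)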
from envs.WheeledRobotPybulletEnv import WheeledRobotPybulletEnv
from stable_baselines.ppo2.ppo2 import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
import matplotlib.pyplot as plt

raw_env = WheeledRobotPybulletEnv(decision_interval=1, use_GUI=True, num_episode_steps=5)

# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
vec_env = DummyVecEnv([lambda: raw_env])

dir_name = "results\\LearningResults\\PPO_WheeledRobotPybullet"
tensorboard_dir = dir_name + "\\tensorboard"
model_dir = dir_name + "\\model"

model = PPO2.load(model_dir, vec_env)
# model.learn(total_timesteps=100, tb_log_name="test")
# model.save(model_dir)

# Policy Rollout: track the robot state over time
env = vec_env.envs[0]
obs_prev = env.reset()
x_poss = [env.snake_robot.x]
y_poss = [env.snake_robot.y]
thetas = [env.snake_robot.theta]
times = [0]
a1s = [env.snake_robot.a1]
a2s = [env.snake_robot.a2]
a1dots = [env.snake_robot.a1dot]
a2dots = [env.snake_robot.a2dot]
# robot_params = []
movements = [
    ['A'],
    ['B'],
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    # ['down'],
    # ['up']
]

_env = gym_super_mario_bros.make('SuperMarioBros-v0')
# _env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle')
env = BinarySpaceToDiscreteSpaceEnv(_env, movements)
env = DummyVecEnv([lambda: env])

model = PPO2(policy=CnnPolicy, env=env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
while True:
    action, _info = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("Training finished")
    print(rewards)
    env.render()
from envs.WheeledRobotPybulletEnv import WheeledRobotPybulletEnv
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.ppo2.ppo2 import PPO2
from stable_baselines.common.vec_env import DummyVecEnv
import matplotlib.pyplot as plt

raw_env = WheeledRobotPybulletEnv(decision_interval=.5, use_GUI=True)

# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
vec_env = DummyVecEnv([lambda: raw_env])
# run multi agents

# Training
dir_name = "results\\LearningResults\\PPO_WheeledRobotPybullet"
tensorboard_dir = dir_name + "\\tensorboard"
model_dir = dir_name + "\\model"

model = PPO2(MlpPolicy, vec_env, verbose=1, tensorboard_log=tensorboard_dir,
             full_tensorboard_log=True)
model.learn(total_timesteps=20000, tb_log_name="test")  # specify
model.save(model_dir)
# TensorBoard --> cmd --> (DeepRobots) C:\Users\Jesse\Desktop\DeepRobots>tensorboard --logdir results\LearningResults\PPO_WheeledRobotPybullet

# Policy Rollout
env = vec_env.envs[0]
obs_prev = env.reset()
x_poss = [env.snake_robot.x]
y_poss = [env.snake_robot.y]
thetas = [env.snake_robot.theta]
times = [0]
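# The model saved to model_dir above could later be reloaded to resume training
# instead of starting from scratch; a sketch (the extra timestep budget and the
# log name are assumptions):
# model = PPO2.load(model_dir, vec_env, tensorboard_log=tensorboard_dir)
# model.learn(total_timesteps=20000, tb_log_name="test_continued", reset_num_timesteps=False)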
def run_ppo(config, state_collector, agent_name="ppo_99_8507750", policy="CnnPolicy",
            mode="train", task_mode="static", stack_offset=15, num_stacks=1,
            debug=True, normalize=True, disc_action_space=False, ns=""):
    path_to_models = config['PATHES']['path_to_models']

    # Loading agent
    model = PPO2.load("%s/%s/%s.pkl" % (path_to_models, agent_name, agent_name))
    print("Loaded %s" % agent_name)
    print("--------------------------------------------------")
    print("Normalize: ", normalize)
    print("Policy: %s" % policy)
    print("Discrete action space: ", disc_action_space)
    print("Observation space size: ", model.observation_space.shape)
    print("Debug: ", debug)
    print("Number of stacks: %d, stack offset: %d" % (model.observation_space.shape[2], stack_offset))
    print("\n")

    # Loading environment
    env = load_train_env(ns, state_collector, 0.46, 19, num_stacks, stack_offset,
                         debug, task_mode, mode, policy, disc_action_space, normalize)

    # Resetting environment
    if mode == "train" or mode == "eval":
        obs = env.reset()
    if mode == "exec" or mode == "exec_rw":
        if disc_action_space:
            obs, rewards, dones, info = env.step([5])
        else:
            obs, rewards, dones, info = env.step([[0.0, 0.0]])

    if debug:
        # Cumulative reward.
        cum_reward = 0

    while True:
        # Determining action for given observation
        action, _states = model.predict(obs)

        # Clipping actions
        if not disc_action_space:
            action = np.maximum(np.minimum(model.action_space.high, action), model.action_space.low)

        # Executing action in environment
        obs, rewards, dones, info = env.step(action)

        if debug:
            cum_reward += rewards

        # Episode over?
        if dones:
            print("Episode finished with reward of %f." % cum_reward)
            cum_reward = 0

        time.sleep(0.0001)

        if rospy.is_shutdown():
            print('shutdown')
            break
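# Note: the element-wise min/max clipping inside run_ppo() can equivalently be
# written with numpy's clip helper (same behavior, assuming low <= high):
# action = np.clip(action, model.action_space.low, model.action_space.high)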
def load(self, path=None):
    if path is None:
        path = self.model_path
    # PPO2.load is a classmethod that returns a new model instance; keep the
    # result so the loaded weights are actually used.
    self.model = PPO2.load(path, env=self.env)
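# Once loaded, the model can be queried for actions; a minimal sketch assuming
# the surrounding class stores the model as self.model (the method name is an
# assumption):
def act(self, obs):
    action, _states = self.model.predict(obs, deterministic=True)
    return action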
    S = s_Dyymmdd_HHMM
    """
    import datetime
    date = str(datetime.datetime.now())
    date = date[2:4] + date[5:7] + date[8:10] + '_' + date[11:13] + date[14:16] + date[17:19]
    return s + '_D' + date


raw_env = IdealFluidSwimmerWithSpringEnv()

# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
vec_env = DummyVecEnv([lambda: raw_env])

results_dir = '../results/PPO_IdealFluidSwimmerWithSpringFixedA1ddot/trial_1.22_D210122_163220'
model_path = os.path.join(results_dir, "model")
model = PPO2.load(model_path, vec_env)

env = vec_env.envs[0]
obs_prev = env.reset()
x_poss = [env.snake_robot.x]
y_poss = [env.snake_robot.y]
thetas = [env.snake_robot.theta]
times = [0]
a1s = [env.snake_robot.a1]
a2s = [env.snake_robot.a2]
a1dots = [env.snake_robot.a1dot]
a2dots = [env.snake_robot.a2dot]
a1ddots = [env.snake_robot.a1ddot]
ks = [env.snake_robot.k]
cs = [env.snake_robot.c]
import gym
from envs.IdealFluidSwimmerWithSpringEnv import IdealFluidSwimmerWithSpringEnv
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.ppo2.ppo2 import PPO2
import matplotlib.pyplot as plt

env = IdealFluidSwimmerWithSpringEnv()
tensorboard_dir = "results/LearningResults/PPO_IdealFluidSwimmerWithSpring"

model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_dir)
model.learn(total_timesteps=25, tb_log_name="trial_run")

obs_prev = env.reset()
x_poss = [env.swimmer.x]
y_poss = [env.swimmer.y]
thetas = [env.swimmer.theta]
times = [0]
a1s = [env.swimmer.a1]
a2s = [env.swimmer.a2]
a1dots = [env.swimmer.a1dot]
a2dots = [env.swimmer.a2dot]
a1ddots = [env.swimmer.a1ddot]
ks = [env.swimmer.k]
cs = [env.swimmer.c]
# robot_params = []

for i in range(100):
    x_prev = env.swimmer.x
    action, _states = model.predict(obs_prev)
    obs, rewards, dones, info = env.step(action)
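    # Continuation sketch (not part of the original excerpt): carry the
    # observation forward and log the swimmer state so the lists above can be
    # plotted afterwards; the time increment of one step is an assumption.
    obs_prev = obs
    x_poss.append(env.swimmer.x)
    y_poss.append(env.swimmer.y)
    thetas.append(env.swimmer.theta)
    times.append(i + 1)
    a1s.append(env.swimmer.a1)
    a2s.append(env.swimmer.a2)
    a1dots.append(env.swimmer.a1dot)
    a2dots.append(env.swimmer.a2dot)
    a1ddots.append(env.swimmer.a1ddot)
    ks.append(env.swimmer.k)
    cs.append(env.swimmer.c)

# Minimal trajectory plot using the matplotlib import above.
plt.plot(x_poss, y_poss)
plt.xlabel("x")
plt.ylabel("y")
plt.title("Swimmer trajectory after the trial run")
plt.show()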