def td3(env, hyper, policy = "MlpPolicy", 
        verbose = 0, tensorboard_log = None, seed = 0, 
        use_sde = True, learning_starts = 100, device = "auto"):
 
  policy_kwargs = make_policy_kwargs(hyper, "td3")
  hyper = action_noise(hyper, "td3", n_actions = env.action_space.shape[0])
  
  #optimize_memory_usage=False, policy_delay=2, target_policy_noise=0.2, target_noise_clip=0.5, 
  
  model = TD3(policy, env,
              verbose = verbose, 
              tensorboard_log = tensorboard_log, 
              seed = seed,
              gamma = hyper['params_gamma'],
              learning_rate = hyper['params_lr'],
              batch_size = int(hyper['params_batch_size']),
              buffer_size = int(hyper['params_buffer_size']),
              action_noise = hyper['params_action_noise'],
              train_freq = int(hyper['params_train_freq']),
              gradient_steps = int(hyper['params_train_freq']),
              n_episodes_rollout = int(hyper['params_n_episodes_rollout']),
              learning_starts = learning_starts,
              policy_kwargs=policy_kwargs,
              device = device)
  return model
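Note: `n_episodes_rollout` only exists in older stable-baselines3 releases; newer versions fold it into `train_freq`, which accepts an `(n, "episode")` tuple (the form Example #3 below uses). A minimal, hedged sketch of the newer call shape, assuming a recent SB3 version:

    # Sketch only: assumes an SB3 version where n_episodes_rollout was replaced by train_freq tuples.
    model = TD3(policy, env,
                train_freq=(1, "episode"),  # collect one full episode per rollout
                gradient_steps=-1)          # -1: one gradient step per environment step collected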
Example #2
def train():
    save_path = "model_save/" + MODEL_PATH + "/"
    os.makedirs(save_path, exist_ok=True)

    # log_dir = f"model_save/"
    log_dir = save_path
    env, env_eval = ENV(util='train', par=PARAM, dt=DT), ENV(util='val', par=PARAM, dt=DT)
    env, env_eval = Monitor(env, log_dir), Monitor(env_eval, log_dir)
    env, env_eval = DummyVecEnv([lambda: env]), DummyVecEnv([lambda: env_eval])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    if PARAM['algo']=='td3':
        model = TD3('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                    learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ddpg':
        model = DDPG('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'],
                     learning_starts=PARAM['learning_starts'])
    elif PARAM['algo']=='ppo':
        model = PPO('MlpPolicy', env, verbose=1, batch_size=PARAM['batch_size'], seed=PARAM['seed'])

    eval_callback = EvalCallback(env_eval, best_model_save_path=save_path+MODEL_PATH+'_best_model',
                                 log_path=log_dir, eval_freq=PARAM['eval_freq'], save_freq=PARAM['save_freq'],
                                 deterministic=True, render=False)

    model.learn(total_timesteps=int(PARAM['total_time_step']), callback=eval_callback, log_interval = 500)
    print("best mean reward:", eval_callback.best_mean_reward_overall, "timesteps:", eval_callback.best_mean_reward_timestep)
    model.save(save_path+MODEL_PATH+'_final_timesteps')
Example #3
    def __init__(self,
                 policy: Union[str, Type[TD3Policy]],
                 env: str,
                 mapper: Callable[[Tensor], Union[Tensor, np.ndarray,
                                                  list]] = None,
                 verbose: bool = True,
                 tensorboard_log: str = "log/") -> None:
        vecEnv = self._get_env(env, mapper)

        self._env = vecEnv
        self._policy = policy
        self._ddpg = TD3(self._policy,
                         self._env,
                         learning_rate=LEARNING_RATE,
                         buffer_size=BUFFER_SIZE,
                         learning_starts=LEARNING_STARTS,
                         batch_size=BATCH_SIZE,
                         tau=TAU,
                         gamma=GAMMA,
                         policy_delay=POLICY_DELAY,
                         train_freq=(N_EPISODES_ROLLOUT, 'episode'),
                         policy_kwargs={"agent_num": vecEnv.agent_num()},
                         verbose=verbose,
                         tensorboard_log=tensorboard_log)
        self._ddpg.replay_buffer = MultiAgentReplayBuffer(
            buffer_size=BUFFER_SIZE,
            observation_space=vecEnv.observation_space,
            action_space=vecEnv.action_space,
            device=self._ddpg.replay_buffer.device,
            n_envs=len(vecEnv.envs),
            n_agent=vecEnv.agent_num(),
            optimize_memory_usage=self._ddpg.replay_buffer.optimize_memory_usage)
Example #4
 def create_model(env, algorithm, save_path):
     # the noise object
     n_actions = env.action_space.shape[-1]
     action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                 sigma=float(0.2) *
                                                 np.ones(n_actions),
                                                 theta=0.15)
     if algorithm == "ddpg":
         return DDPG(DDPG_MlpPolicy,
                     env,
                     learning_rate=0.001,
                     buffer_size=1000000,
                     batch_size=64,
                     tau=0.001,
                     gamma=0.99,
                     train_freq=(10, "step"),
                     action_noise=action_noise,
                     policy_kwargs=dict(optimizer_class=th.optim.AdamW),
                     tensorboard_log=save_path)
     elif algorithm == "td3":
         return TD3(TD3_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     elif algorithm == "sac":
         return SAC(SAC_MlpPolicy,
                    env,
                    action_noise=action_noise,
                    tensorboard_log=save_path)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
Example #5
    def train_TD3(self, model_name, model_params = config.TD3_PARAMS):
        """TD3 model"""
        from stable_baselines3 import TD3
        from stable_baselines3.td3.policies import MlpPolicy
        from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

        env_train = self.env

        n_actions = env_train.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1*np.ones(n_actions))

        start = time.time()
        model = TD3('MlpPolicy', env_train,
                    batch_size=model_params['batch_size'],
                    buffer_size=model_params['buffer_size'],
                    learning_rate = model_params['learning_rate'],
                    action_noise = action_noise,
                    verbose=model_params['verbose'],
                    tensorboard_log = f"{config.TENSORBOARD_LOG_DIR}/{model_name}"
                    )
        model.learn(total_timesteps=model_params['timesteps'], tb_log_name = "TD3_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (TD3): ', (end-start)/60,' minutes')
        return model
Example #6
 def load_model(env, algorithm, filename):
     if algorithm == "ddpg":
         return DDPG.load(filename, env=env)
     elif algorithm == "td3":
         return TD3.load(filename, env=env)
     elif algorithm == "sac":
         return SAC.load(filename, env=env)
     else:
         raise Exception("--> Alican's LOG: Unknown agent type!")
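A brief usage sketch for the loader above (hedged: `env`, `filename`, and the episode count are placeholders; evaluate_policy is SB3's standard evaluation helper, not part of the snippet):

    from stable_baselines3.common.evaluation import evaluate_policy

    # Load a TD3 checkpoint and score it on the same environment.
    model = load_model(env, "td3", filename)
    mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10, deterministic=True)
    print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")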
Example #7
def test_td3(action_noise):
    model = TD3('MlpPolicy',
                'Pendulum-v0',
                policy_kwargs=dict(net_arch=[64, 64]),
                learning_starts=100,
                verbose=1,
                create_eval_env=True,
                action_noise=action_noise)
    model.learn(total_timesteps=1000, eval_freq=500)
Example #8
def test_save_load_large_model(tmp_path):
    """
    Test saving and loading a model with a large policy that is greater than 2GB. We
    test only one algorithm since all algorithms share the same code for loading and
    saving the model.
    """
    env = select_env(TD3)
    kwargs = dict(policy_kwargs=dict(net_arch=[8192, 8192, 8192]),
                  device="cpu")
    model = TD3("MlpPolicy", env, **kwargs)

    # test saving
    model.save(tmp_path / "test_save")

    # test loading
    model = TD3.load(str(tmp_path / "test_save.zip"), env=env, **kwargs)

    # clear file from os
    os.remove(tmp_path / "test_save.zip")
Example #9
def train_td3():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)
    model = TD3("CnnPolicy", env, policy_kwargs=policy_kwargs, verbose=1, batch_size=2048, seed=1, learning_starts=500000)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(1000000), callback = callback, log_interval = 480)
    model.save('model_save/td3_cnn')
Example #10
    def __init__(self, env, hyperparameters=DEFAULT_HYPERPARAMETERS):
        self.P = hyperparameters

        if self.P["model_class"] == "dqn":
            from stable_baselines3 import DQN
            self.model = DQN('MlpPolicy', env, verbose=self.P["verbose"])
            self.model_class = DQN

        elif self.P["model_class"] == "a2c":
            from stable_baselines3 import A2C
            from stable_baselines3.a2c import MlpPolicy
            self.model = A2C(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = A2C

        elif self.P["model_class"] == "ddpg":
            from stable_baselines3 import DDPG
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = DDPG('MlpPolicy',
                              env,
                              action_noise=action_noise,
                              verbose=self.P["verbose"])
            self.model_class = DDPG

        elif self.P["model_class"] == "td3":
            from stable_baselines3 import TD3
            from stable_baselines3.td3.policies import MlpPolicy
            from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
            n_actions = env.action_space.shape[-1]
            action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                             sigma=0.1 * np.ones(n_actions))
            self.model = TD3(MlpPolicy,
                             env,
                             action_noise=action_noise,
                             verbose=self.P["verbose"])
            self.model_class = TD3

        elif self.P["model_class"] == "ppo":
            from stable_baselines3 import PPO
            from stable_baselines3.ppo import MlpPolicy
            self.model = PPO(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = PPO

        elif self.P["model_class"] == "sac":
            from stable_baselines3 import SAC
            from stable_baselines3.sac import MlpPolicy
            self.model = SAC(MlpPolicy, env, verbose=self.P["verbose"])
            self.model_class = SAC

        else:
            raise NotImplementedError()
Example #11
def test(MODEL_TEST):
    log_dir = "model_save/" + MODEL_PATH + "/" + MODEL_PATH + MODEL_TEST

    env = ENV(util='test', par=PARAM, dt=DT)
    env.render = True
    env = Monitor(env, log_dir)

    if PARAM['algo']=='td3':
        model = TD3.load(log_dir)
    elif PARAM['algo']=='ddpg':
        model = DDPG.load(log_dir)
    elif PARAM['algo']=='ppo':
        model = PPO.load(log_dir)

    # plot_results(f"model_save/")
    trade_dt = pd.DataFrame([])     # trade_dt: trading data for every stock
    result_dt = pd.DataFrame([])    # result_dt: one-year test results for every stock
    for i in range(TEST_STOCK_NUM):
        state = env.reset()
        stock_bh_id = 'stock_bh_'+str(i)            # buy-and-hold value for this stock
        stock_port_id = 'stock_port_'+str(i)        # portfolio value for this stock
        stock_action_id = 'stock_action_' + str(i)  # actions taken for this stock
        flow_L_id = 'stock_flow_' + str(i)          # cash-flow record for this stock
        stock_bh_dt, stock_port_dt, action_policy_dt, flow_L_dt = [], [], [], []
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)   # 测试每一步的交易policy
            stock_bh_dt.append(env.buy_hold)
            stock_port_dt.append(env.Portfolio_unit)
            action_policy_dt.append(action[0][0])  # record the policy's action
            flow_L_dt.append(env.flow)
            day+=1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit*100, env.buy_hold*100, env.sp, env.mdd*100, env.romad))
                # after trading, record: stock ID, profit (%), buy-and-hold (%), Sharpe ratio, max drawdown (%), RoMaD
                result=pd.DataFrame([[i,env.profit*100,env.buy_hold*100,env.sp,env.mdd*100,env.romad]])
                break

        trade_dt_stock = pd.DataFrame({stock_port_id: stock_port_dt,
                                       stock_bh_id: stock_bh_dt,
                                       stock_action_id: action_policy_dt,
                                       flow_L_id: flow_L_dt})  # trading data for this stock

        trade_dt = pd.concat([trade_dt, trade_dt_stock], axis=1)    # merge trading data across stocks (column-wise)
        result_dt = pd.concat([result_dt,result],axis=0)            # merge result rows across stocks (row-wise)

    result_dt.columns = ['stock_id','profit(100%)','buy_hold(100%)','sp','mdd(100%)','romad']
    trade_dt.to_csv('out_dt/trade_'+MODEL_PATH+'.csv',index=False)
    result_dt.to_csv('out_dt/result_'+MODEL_PATH+'.csv',index=False)
Example #12
def main():
    args = parse_arguments()
    load_path = os.path.join("logs", args.env, args.agent, "best_model.zip")
    stats_path = os.path.join(args.log_dir, args.env, args.agent, "vec_normalize.pkl")

    if args.agent == 'ddpg':
        from stable_baselines3 import DDPG
        model = DDPG.load(load_path)
    elif args.agent == 'td3':
        from stable_baselines3 import TD3
        model = TD3.load(load_path)
    elif args.agent == 'ppo':
        from stable_baselines3 import PPO
        model = PPO.load(load_path)

    env = make_vec_env(args.env, n_envs=1)
    env = VecNormalize.load(stats_path, env)
    #  do not update them at test time
    env.training = False
    # reward normalization is not needed at test time
    env.norm_reward = False
    
    # env = gym.make(args.env)
    img = []
    if args.render:
        env.render('human')
    done = False
    obs = env.reset()
    action = model.predict(obs)
    if args.gif:
        img.append(env.render('rgb_array'))

    if args.timesteps is None:
        while not done: 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()
    else:
        for i in range(args.timesteps): 
            action, _= model.predict(obs)
            obs, reward, done, info = env.step(action)
            if args.gif:
                img.append(env.render('rgb_array'))
            else:
                env.render()

    if args.gif:
        imageio.mimsave(f'{os.path.join("logs", args.env, args.agent, "recording.gif")}', [np.array(img) for i, img in enumerate(img) if i%2 == 0], fps=29)
Example #13
def play():
    model = TD3.load("models/kuka_iiwa_insertion-v0")

    env = gym.make('kuka_iiwa_insertion-v0', use_gui=True)

    obs = env.reset()
    i = 0
    while True:
        i += 1
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if i % 100 == 0 or dones: 
            print(obs, rewards, dones, info)
        if dones:
            print("="*20 + " RESET " + "="*20)
            obs = env.reset()
Example #14
    def create(self, n_envs=1):
        """Create the agent"""
        self.env = self.agent_helper.env
        log_dir = self.agent_helper.config_dir
        os.makedirs(log_dir, exist_ok=True)
        self.env = Monitor(self.env, log_dir)
        # TODO: create the policy and define its hyperparameters here,
        # including the action space and observation space.
        # add policy
        policy_name = self.agent_helper.config['policy']
        self.policy = eval(policy_name)
        # action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
        n_actions = int(self.agent_helper.env.action_space.shape[0])
        action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=self.agent_helper.config['rand_sigma'] * np.ones(n_actions))
        
        #FIXME: test:
        # self.model = DDPG("MlpPolicy", self.env, action_noise=action_noise, verbose=1, tensorboard_log=self.agent_helper.graph_path)

        # TODO: fix the observation space and action space later. Check that the observation-space input and the action-space output are correct.
        # activ_function_name = self.agent_helper.config['nn_activ']
        # activ_function = eval(activ_function_name)

        # policy_kwargs = dict(activation_fn=activ_function,
        #              net_arch=[dict(pi=[32, 32], qf=[32, 32])])
        policy_kwargs = dict(net_arch=self.agent_helper.config['layers'])
        self.model = TD3(
            self.policy,
            self.env,
            learning_rate=self.agent_helper.config['learning_rate'],
            buffer_size = self.agent_helper.config['buffer_size'],
            batch_size=self.agent_helper.config['batch_size'],
            tau=self.agent_helper.config['tau'],
            gamma=self.agent_helper.config['gamma'],
            gradient_steps=self.agent_helper.config['gradient_steps'],
            action_noise=action_noise,
            optimize_memory_usage=self.agent_helper.config['optimize_memory_usage'],
            create_eval_env=self.agent_helper.config['create_eval_env'],
            policy_kwargs=policy_kwargs,
            verbose=self.agent_helper.config['verbose'],
            learning_starts=self.agent_helper.config['learning_starts'],
            tensorboard_log=self.agent_helper.graph_path,
            policy_delay = self.agent_helper.config['policy_delay'],
            target_policy_noise= self.agent_helper.config['target_policy_noise'],
            target_noise_clip= self.agent_helper.config['target_noise_clip'],
            seed=self.agent_helper.seed
        )
Example #15
def train_td3():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
    # model = TD3("MlpPolicy", env, action_noise=action_noise, verbose=1, batch_size=2048, seed=1)

    model = TD3('MlpPolicy', env, verbose=1, batch_size=2048, seed=1, learning_starts=1440)
    callback = SaveOnBestTrainingRewardCallback(check_freq=480, log_dir=log_dir)
    model.learn(total_timesteps=int(2880), callback = callback, log_interval = 100)
    model.save('model_save/td3_sp2')
Example #16
def main():
    n_envs = 8
    env_id = "CartPole-v0"
    def env_fn():
        return continuous_actions(gym.make(env_id))

    env = env_fn()
    #print(env.observation_space)
    #obs_size, = env.observation_space.shape
    #act_size = env.action_space.n

    sb3_env = SpaceWrap(env)

    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50

    tensorboard_log = ""
    sb3_learner_fn = lambda device: TD3(env=sb3_env,
                                        tensorboard_log=tensorboard_log,
                                        policy=MlpPolicy,
                                        device=device)
    learner_fn = lambda: SB3LearnWrapper(sb3_learner_fn("cuda"))

    policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cuda").policy)
    example_policy_fn = lambda: SB3Wrapper(sb3_learner_fn("cpu").policy)
    #learner = (model)
    learn_rate = lambda x: 0.01
    #policy = SB3Wrapper(model.policy)#MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu"))
    data_store_size = 12800
    batch_size = 512
    logger = make_logger("log")
    run_loop(
        logger,
        learner_fn,  #A2CLearner(policy, 0.001, 0.99, logger, device),
        OccasionalUpdate(10, example_policy_fn()),
        lambda: StatelessActor(policy_fn()),
        env_fn,
        MakeCPUAsyncConstructor(4),
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
Example #17
def test_td3():
    log_dir = f"model_save/best_model_td3_cnn"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",i,"action:", action,"now profit:",env.profit)
            if done:
                print('stock',i,' total profit=',env.profit,' buy hold=',env.buy_hold)
                break
Example #18
def test_td3_train_with_batch_norm():
    model = TD3(
        "MlpPolicy",
        "Pendulum-v0",
        policy_kwargs=dict(net_arch=[16, 16], features_extractor_class=FlattenBatchNormDropoutExtractor),
        learning_starts=0,
        tau=0,  # do not copy the target
        seed=1,
    )

    (
        actor_bias_before,
        actor_running_mean_before,
        critic_bias_before,
        critic_running_mean_before,
        actor_target_bias_before,
        actor_target_running_mean_before,
        critic_target_bias_before,
        critic_target_running_mean_before,
    ) = clone_td3_batch_norm_stats(model)

    model.learn(total_timesteps=200)

    (
        actor_bias_after,
        actor_running_mean_after,
        critic_bias_after,
        critic_running_mean_after,
        actor_target_bias_after,
        actor_target_running_mean_after,
        critic_target_bias_after,
        critic_target_running_mean_after,
    ) = clone_td3_batch_norm_stats(model)

    assert ~th.isclose(actor_bias_before, actor_bias_after).all()
    assert ~th.isclose(actor_running_mean_before, actor_running_mean_after).all()

    assert ~th.isclose(critic_bias_before, critic_bias_after).all()
    assert ~th.isclose(critic_running_mean_before, critic_running_mean_after).all()

    assert th.isclose(actor_target_bias_before, actor_target_bias_after).all()
    assert th.isclose(actor_target_running_mean_before, actor_target_running_mean_after).all()

    assert th.isclose(critic_target_bias_before, critic_target_bias_after).all()
    assert th.isclose(critic_target_running_mean_before, critic_target_running_mean_after).all()
Example #19
    def prepare_stage(self):

        dir = f'experiments/{self.config.experiment_name}'
        if not os.path.exists(dir):
            os.mkdir(dir)

        else:

            # recovers the latest non-corrupted checkpoint, if existent

            checkpoints = []
            for file in glob.glob(f'{dir}/status_checkpoint*'):
                checkpoints.append(
                    int(file.split('/status_checkpoint_')[1].split('.')[0]))
            checkpoints.sort()

            attempts = len(checkpoints) - 1

            while attempts >= 0:
                try:
                    f = open(
                        f'{dir}/status_checkpoint_{checkpoints[attempts]}.pkl',
                        'rb')
                    self.results_episodes, self.results_episodes_validation, self.current_checkpoint, self.current_episode = pickle.load(
                        f)

                    # only recovers pickle if model also available
                    env2 = DummyVecEnv([lambda: self.env])
                    self.model = TD3.load(
                        f'{dir}/model_checkpoint_{checkpoints[attempts]}',
                        env=env2)

                    self.log.write(
                        f'RECOVERED checkpoint {checkpoints[attempts]}')

                    attempts = -1

                except Exception:
                    self.log.write(
                        f'ERROR: Could not recover checkpoint {checkpoints[attempts]}  {traceback.format_exc()}'
                    )
                    self.results_episodes, self.results_episodes_validation, self.current_checkpoint, self.current_episode = [], [], 0, 0

                attempts -= 1
Example #20
def main():
    n_envs = 8
    env_id = "CartPole-v0"
    def env_fn():
        return continuous_actions(gym.make(env_id))

    env = env_fn()
    #print(env.observation_space)
    #obs_size, = env.observation_space.shape
    #act_size = env.action_space.n

    sb3_env = SpaceWrap(env)

    # print(sb3_env.action_space)
    # exit(0)
    n_timesteps = 1000
    save_path = "log"
    eval_freq = 50

    tensorboard_log = ""

    model = TD3(env=sb3_env, tensorboard_log=tensorboard_log, policy=MlpPolicy)
    learner = SB3LearnWrapper(model)
    device = "cpu"
    learn_rate = lambda x: 0.01
    policy = SB3Wrapper(
        model.policy
    )  #MlpPolicy(env.observation_space, env.action_space, learn_rate, device="cpu"))
    data_store_size = 12800
    batch_size = 16
    logger = make_logger("log")
    run_loop(
        logger,
        lambda: learner,  #A2CLearner(policy, 0.001, 0.99, logger, device),
        NoUpdate(),  #.10, policy),
        lambda: StatelessActor(policy),
        env_fn,
        ConcatVecEnv,
        lambda: TransitionAdder(env.observation_space, env.action_space),
        UniformSampleScheme(data_store_size),
        data_store_size,
        batch_size,
        n_envs=16,
        log_frequency=5)
Example #21
    def __init__(self, model='a2c', use_gp=False, gp_params=None, **kwargs):
        # wrapper around stable_baselines RL implementations
        assert model in ACCEPTED_MODELS, 'Unknown RL model, must be in {}'.format(ACCEPTED_MODELS)
        if model == 'a2c':
            self.rl = A2C(**kwargs)
        elif model == 'ppo':
            self.rl = PPO(**kwargs)
        elif model == 'dqn':
            self.rl = DQN(**kwargs)
        elif model == 'td3':
            self.rl = TD3(**kwargs)

        self.use_gp = use_gp
        if self.use_gp:
            assert gp_params is not None, 'Must provide parameters such as training data, number of iterations, etc. for GPR'
            self.n_train = gp_params['n_train']
            self.retraining_iter = gp_params['training_iter']
            self.cvar_limit = gp_params['cvar_limit']
            self.gp_limit = gp_params['gp_limit']

            self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
            if 'data' in gp_params.keys():
                self.X_train = gp_params['data']['X_train']
                self.y_train = gp_params['data']['y_train']
            else:
                self.X_train = torch.zeros(self.n_train, kwargs['env'].num_features) # hard coded to match dimensions of features
                self.y_train = torch.zeros(self.n_train)
            self.gp = ExactGPModel(self.X_train, self.y_train, self.likelihood)
            self.mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.gp)
            self.opt = torch.optim.Adam(self.gp.parameters(), lr=0.1)

            self.shares = 0
            self.cash = 0
            self.obs = [] # holds up to 2 past observations, helps in keeping X, y aligned

            # for plotting
            self.pred_return = 0
            self.pred_lower = 0
            self.pred_upper = 0

            # for debugging
            self.goal_num_shares = 0
Example #22
def run(env, algname, filename):
    if algname == "TD3":
        model = TD3.load(f"{algname}_pkl")
    elif algname == "SAC":
        if filename:
            model = SAC.load(f"{filename}")
        else:
            model = SAC.load(f"{algname}_pkl")
    elif algname == "DDPG":
        model = DDPG.load(f"{algname}_pkl")
    else:
        raise "Wrong algorithm name provided."

    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
        if done:
            break
Example #23
def train_TD3(env):

    print(f"action space shape -1:{env.action_space.shape[-1]}")

    # The noise objects for TD3
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.02 * np.ones(n_actions))

    model = TD3(MlpPolicy,
                env,
                learning_rate=0.0003,
                buffer_size=100000,
                action_noise=action_noise,
                batch_size=128,
                learning_starts=128,
                verbose=1)
    model.learn(total_timesteps=2000000, log_interval=10)

    model.save("TD3_pkl")
Example #24
def test_td3():
    log_dir = f"model_save/best_model_td3_sp2"
    env = ENV(istest=True)
    env.render = True
    env = Monitor(env, log_dir)
    model = TD3.load(log_dir)
    plot_results(f"model_save/")
    for i in range(10):
        state = env.reset()
        day = 0
        while True:
            action = model.predict(state)
            next_state, reward, done, info = env.step(action[0])
            state = next_state
            # print("trying:",day,"reward:", reward,"now profit:",env.profit)
            day+=1
            if done:
                print('stock: {}, total profit: {:.2f}%, buy hold: {:.2f}%, sp: {:.4f}, mdd: {:.2f}%, romad: {:.4f}'
                      .format(i, env.profit*100, env.buy_hold*100, env.sp, env.mdd*100, env.romad))
                break
Example #25
    def learn(self, initial_models):
        mesa_algo = TD3(
            "MlpPolicy", self.env, verbose=1, learning_starts=1
        )  # Note: unnecessarily initializes parameters (could speed up a bit by fixing)

        mesa_algo.set_parameters(to_torch(initial_models), exact_match=False)
        LOG_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/logs/"
        MODEL_DIR = "/home/jet/catkin_ws/src/marsha/marsha_ai/training/models/"

        callback_list = []
        callback_list.append(TensorboardCallback())
        callback_list.append(
            StopTrainingOnMaxEpisodes(max_episodes=5, verbose=1))
        """callback_list.append(EvalCallback(self.env, best_model_save_path=MODEL_DIR, log_path=LOG_DIR,
                                    deterministic=True,
                                    eval_freq=5,
                                    n_eval_episodes=1))"""
        mesa_algo.learn(total_timesteps=1000, callback=callback_list
                        )  #rospy.get_param("/hyperparameters/total_timesteps")

        print("finished training! Testing mesa network...")
        test_buffer = ReplayBuffer(100,
                                   TaskEnv.observation_space,
                                   TaskEnv.action_space,
                                   device="cuda")

        test_env = Monitor(self.env)
        done = False
        ob = test_env.reset()
        while not done:
            action, state = mesa_algo.predict(ob)
            next_ob, reward, done, info = test_env.step(action)
            test_buffer.add(ob, next_ob, action, reward, done, [info])
            ob = next_ob

        meta_buffer = {"test": test_buffer, "train": mesa_algo.replay_buffer}

        optimized_mesa_parameters = mesa_algo.get_parameters()
        tf_mesa_models = from_torch(optimized_mesa_parameters)

        return meta_buffer, tf_mesa_models
Example #26
def train():

    log_dir = f"model_save/"
    env = ENV(istest=False)
    env = Monitor(env, log_dir)
    env = DummyVecEnv([lambda: env])
    # env = VecNormalize(env, norm_obs=True, norm_reward=True,
    #                clip_obs=10.)

    model = TD3('MlpPolicy',
                env,
                verbose=1,
                batch_size=PARAM['batch_size'],
                seed=PARAM['seed'],
                learning_starts=PARAM['learning_starts'])
    callback = SaveOnBestTrainingRewardCallback(check_freq=480,
                                                log_dir=log_dir)
    model.learn(total_timesteps=int(PARAM['total_time_step']),
                callback=callback,
                log_interval=480)
    model.save('model_save/' + MODEL_PATH)
Example #27
    def __init__(self):

        #self.observation_space = TaskEnv.observation_space
        #self.action_space = TaskEnv.action_space

        self.ros_interface = CatchInterface()
        self.env = MarshaGym(self.ros_interface)
        self.mesa_algo = TD3("MlpPolicy", self.env)
        self.tasks = [Task(self.env), Task(self.env)]
        self.replay_buffer = None  # will use one of the task replay buffers
        self.lambda_reg = 2.0  # Regularization Strength (2.0 according to iMAML paper)

        self.meta_models = {
            "actor": Actor(),
            "critic_0": Critic(),
            "critic_1": Critic()
        }

        self.loss_functions = [self.actor_loss, self.critic_loss]

        self.optimized_mesa_models = None
Example #28
def make_model(config, env):
    policy = config["policy_name"]

    if config["policy_name"] == "CustomTCNPolicy":
        policy = customActorCriticPolicyWrapper(
            env.observation_space.shape[0] // config["obs_input"],
            config["obs_input"])

    tb_log = None
    if config["tensorboard_log"]:
        tb_log = "./tb/{}/".format(config["session_ID"])

    ou_noise = None
    if config["ou_noise"]:
        ou_noise = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(env.action_space.shape[0]),
            sigma=config["ou_sigma"] * np.ones(env.action_space.shape[0]),
            theta=config["ou_theta"],
            dt=config["ou_dt"],
            initial_noise=None)

    model = TD3(policy=policy,
                env=env,
                buffer_size=config["buffer_size"],
                learning_starts=config["learning_starts"],
                action_noise=ou_noise,
                target_policy_noise=config["target_policy_noise"],
                target_noise_clip=config["target_noise_clip"],
                gamma=config["gamma"],
                tau=config["tau"],
                learning_rate=eval(config["learning_rate"]),
                verbose=config["verbose"],
                tensorboard_log=tb_log,
                device="cpu",
                policy_kwargs=dict(net_arch=[
                    int(config["policy_hid_dim"]),
                    int(config["policy_hid_dim"])
                ]))

    return model
Example #29
def main():
    # Create log dir
    log_dir = './td3_data'
    os.makedirs(log_dir, exist_ok=True)

    vix_env = trading_vix_env.trading_vix_env()
    env = Monitor(vix_env, log_dir)

    # Create action noise because TD3 and DDPG use a deterministic policy
    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))
    # Create the callback: check every 20000 steps
    callback = custom_call_back.CustomCallback(check_freq=20000,
                                               log_dir=log_dir)
    # Create RL model
    model = TD3('MlpPolicy',
                env,
                action_noise=action_noise,
                verbose=2,
                batch_size=10000)
    # Train the agent
    model.learn(total_timesteps=int(5e9), callback=callback)
Example #30
    algo = ARGS.exp.split("-")[2]

    if os.path.isfile(ARGS.exp + '/success_model.zip'):
        path = ARGS.exp + '/success_model.zip'
    elif os.path.isfile(ARGS.exp + '/best_model.zip'):
        path = ARGS.exp + '/best_model.zip'
    else:
        print("[ERROR]: no model under the specified path", ARGS.exp)
    if algo == 'a2c':
        model = A2C.load(path)
    if algo == 'ppo':
        model = PPO.load(path)
    if algo == 'sac':
        model = SAC.load(path)
    if algo == 'td3':
        model = TD3.load(path)
    if algo == 'ddpg':
        model = DDPG.load(path)

    #### Parameters to recreate the environment ################
    env_name = ARGS.exp.split("-")[1] + "-aviary-v0"
    OBS = ObservationType.KIN if ARGS.exp.split(
        "-")[3] == 'kin' else ObservationType.RGB
    if ARGS.exp.split("-")[4] == 'rpm':
        ACT = ActionType.RPM
    elif ARGS.exp.split("-")[4] == 'dyn':
        ACT = ActionType.DYN
    elif ARGS.exp.split("-")[4] == 'pid':
        ACT = ActionType.PID
    elif ARGS.exp.split("-")[4] == 'vel':
        ACT = ActionType.VEL