Example #1
File: train.py Project: xc-jp/InsertionAI
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('--host',
                        default="127.0.0.1",
                        type=str,
                        help='IP of the server')
    parser.add_argument(
        '--port',
        default=9081,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument('--save',
                        action="store_true",
                        help=('Saves checkpoints'))
    parser.add_argument(
        '--use_coord',
        action="store_true",
                        help=('If set, the environment\'s observation space will be '
              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    env = gym.make('insertion-v0',
                   kwargs={
                       'host': args.host,
                       "port": args.port,
                       "use_coord": args.use_coord
                   })
    # check_env(env, warn=True)

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")
    # print(env.action_space.sample())

    # Save a checkpoint every 50000 steps
    ckpt = CkptCallback(save_freq=50000,
                        save_path='../checkpoints/',
                        name_prefix='rl_insertion') if args.save else None

    if args.use_coord:
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")

    model.learn(50001, callback=ckpt)
Example #2
def main():
    parser = argparse.ArgumentParser("Insertion, Manual mode")
    parser.add_argument('checkpoint_path', type=str, help='Path to checkpoint')
    parser.add_argument('--host',
                        default="192.168.2.121",
                        type=str,
                        help='IP of the server (default is a Windows#2)')
    parser.add_argument(
        '--port',
        default=9090,
        type=int,
        help='Port that should be used to connect to the server')
    parser.add_argument(
        '--use_coord',
        action="store_true",
        help=('If set, the environment\'s observation space will be '
              'coordinates instead of images'))
    args = parser.parse_args()

    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    env = gym.make('insertion-v0',
                   kwargs={
                       'host': args.host,
                       "port": args.port,
                       "use_coord": args.use_coord
                   })

    print(f"Observation space: {env.observation_space}")
    print(f"Action space: {env.action_space}")

    if args.use_coord:
        model = SAC('MlpPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    else:
        model = SAC('CnnPolicy',
                    env,
                    verbose=1,
                    tensorboard_log="../insertion_tensorboard/")
    # SAC.load is a classmethod that returns a new model; assign the result so the
    # loaded weights are actually used below
    model = SAC.load(args.checkpoint_path, env=env)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()
Example #3
def main(argv):
    fixed = True

    policy_name = "sac_reaching_policy"

    obj_pose_rnd_std = 0 if fixed else 0.05
    pandaenv = pandaReachGymEnv(renders=True,
                                use_IK=0,
                                numControlledJoints=7,
                                obj_pose_rnd_std=obj_pose_rnd_std,
                                includeVelObs=True)
    n_actions = pandaenv.action_space.shape[-1]

    pandaenv = DummyVecEnv([lambda: pandaenv])

    model = SAC(MlpPolicy,
                pandaenv,
                gamma=0.9,
                batch_size=16,
                verbose=1,
                tensorboard_log="../pybullet_logs/pandareach_sac/")

    model.learn(total_timesteps=1000000)

    model.save("../pybullet_logs/pandareach_sac/" + policy_name)

    del model  # remove to demonstrate saving and loading
Example #4
def func_run(env, logger, lr, action_noise, file):
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 5e7

    save_video_length = 200
    save_video_interval = 1000000
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)
    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=lr,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    exp_name = expDir + "/%s/%s_%s" % (name, np.format_float_scientific(nIter),
                                       np.format_float_scientific(lr))
    model.save(exp_name)
    file.write(exp_name + '\n')
    env.close()
    return True
Example #5
def train_sac(training_tag):
    env = gym.make(ENVIRONMENT_NAME)
    env = DummyVecEnv([lambda: env])

    if (isinstance(training_tag, float)):
        model = SAC(sac_MlpPolicy,
                    env,
                    ent_coef=training_tag,
                    verbose=1,
                    policy_kwargs=POLICY_KWARGS)
        for step in range(TRAINING_STEPS):
            env.reset()

            # stable_baselines' learn() returns the model itself, not a tuple
            model = model.learn(
                total_timesteps=TRAINING_TIMESTEPS, log_interval=100)

            file_tag = str(training_tag).replace(".", "p")
            if (SAVE_AGENTS):
                model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_s" +
                           str(step) + "_t" + str(file_tag) + "_i" +
                           str(CURRENT_ITERATION) + "_ts" +
                           str(TRAINING_TIMESTEPS))

        if (SAVE_FINAL_AGENT):
            model.save("nchain/models/SAC_" + ENVIRONMENT_NAME + "_t" +
                       str(file_tag) + "_i" + str(CURRENT_ITERATION) + "_ts" +
                       str(TRAINING_STEPS * TRAINING_TIMESTEPS))

        env.reset()
        del model

    return data
Example #6
def explore(app,
            emulator,
            appium,
            timesteps,
            timer,
            save_policy,
            policy_dir,
            cycle,
            train_freq=5,
            target_update_interval=10):
    try:
        env = TimeFeatureWrapper(app)
        model = SAC(MlpPolicy,
                    env,
                    verbose=1,
                    train_freq=train_freq,
                    target_update_interval=target_update_interval)
        callback = TimerCallback(timer=timer, app=app)
        model.learn(total_timesteps=timesteps, callback=callback)
        if save_policy:
            model.save(f'{policy_dir}{os.sep}{cycle}')
        return True
    except Exception as e:
        print(e)
        appium.restart_appium()
        if emulator is not None:
            emulator.restart_emulator()
        return False
Example #7
def run_experiment(verbose, tensorboard_log, learning_rate):
    pdb.set_trace()
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    env = VecVideoRecorder(
        env,
        osp.join(logger, "videos"),
        record_video_trigger=lambda x: x % save_video_interval == 0,
        video_length=save_video_length)

    n_actions = env.action_space.shape[-1]
    stddev = 0.2
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = SAC(
        MlpPolicy,
        env,
        verbose=verbose,
        tensorboard_log=logger,
        learning_rate=learning_rate,
        action_noise=action_noise,
    )
    model.learn(total_timesteps=int(nIter), log_interval=100)
    model.save(expDir + "/%s/%s_%s" %
               (name, np.format_float_scientific(nIter),
                np.format_float_scientific(learning_rate)))
    env.close()
Example #8
def model_training_learning(env_train, model_name, timesteps=100000):

    # train model
    os.chdir("./model_saved/" + model_name)
    start = time.time()
    print("Train ", model_name, " Model with MlpPolicy: ")

    if model_name == "A2C_Model":
        model = A2C('MlpPolicy', env_train, verbose=0)
    elif model_name == "PPO_Model":
        model = PPO2('MlpPolicy', env_train, verbose=0)
    elif model_name == "TD3_Model":
        model = TD3('MlpPolicy', env_train, verbose=0)
    elif model_name == "SAC_Model":
        model = SAC('MlpPolicy', env_train, verbose=0)

    print("Learning ", model_name, " time steps: ", timesteps)

    model.learn(total_timesteps=timesteps)
    print("TD3 Model learning completed: ")
    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("- ", model_name, " save finish     :")
    print("Training time  ", model_name, " : ", (end - start) / 60, " minutes")

    os.chdir("./..")
    os.chdir("./..")
    return model
Example #9
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = make_mujoco_env(env, 0)
    env = Monitor(env, log_dir + "/")

    continue_train = False
    if continue_train:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  #action_noise=action_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Example #10
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array(
                             [250, 0, 125, 0, 750, 0, 0, 0.005]),
                         fractional_reward_weight=1,
                         goal_height=0.15,
                         tool_block_mass=0.02)
    env = CausalWorld(task=task,
                      skip_frame=skip_frame,
                      enable_visualization=False,
                      seed=seed_num,
                      max_episode_length=maximum_episode_length)
    set_global_seeds(seed_num)
    policy_kwargs = dict(layers=[256, 256])
    checkpoint_callback = CheckpointCallback(save_freq=int(
        validate_every_timesteps / num_of_envs),
                                             save_path=log_relative_path,
                                             name_prefix='model')
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                policy_kwargs=policy_kwargs,
                **sac_config,
                seed=seed_num)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="sac",
                callback=checkpoint_callback)
    return
Example #11
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    # env.render()
    env_ = gym.make(env_name)

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_SAC_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)
    env = Monitor(env, filename=path)
    ############################
    #          Logging         #
    ############################
    logger.configure(path)
    config = {}
    config['load'] = [{'load_model': load_model}]
    config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
    config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
    config['policy'] = [{'policy_network': policy_kwargs}]
    with open('./run/' + model_name + '/' + model_name + '.txt',
              'w+') as outfile:
        json.dump(config, outfile, indent=4)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    eval_callback = EvalCallback_wandb_SAC(env_,
                                           n_eval_episodes=eval_ep,
                                           eval_freq=eval_freq,
                                           log_path=path)
    callbacklist.append(ckpt_callback)
    callbacklist.append(eval_callback)
    callback = CallbackList(callbacklist)

    ############################
    #            run           #
    ############################
    # policy_kwargs = dict(net_arch=[128, dict(vf=[256], pi=[16])])
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=int(num_time_steps),
                log_interval=20,
                callback=callback)
    model.save(path + "SAC_Walker2d")
Example #12
def train(learning_rate, time_steps, env, model_path):
    
    tf.reset_default_graph()  # reset the default graph to avoid conflicts with existing parameters (not recommended if parameters are meant to be reused)


    # default policy is MlpPolicy
    model = SAC(CustomSACPolicy, env, verbose=1, seed=10, n_cpu_tf_sess=16)
    model.learn(total_timesteps=int(time_steps), log_interval=1000, callback=callback)
    model.save(model_path)
Example #13
def train_SAC(env_train, model_name, timesteps=50000):
    start = time.time()
    model = SAC('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (SAC): ', (end - start) / 60, ' minutes')
    return model
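A minimal call sketch for the helper above; the environment id is an assumption for illustration, and config.TRAINED_MODEL_DIR must already point at an existing directory:

import gym

env_train = gym.make('Pendulum-v0')  # any continuous-action env works with SAC
model = train_SAC(env_train, model_name='SAC_pendulum', timesteps=10000)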
Example #14
def train():
    machine = StateMachine()
    machine.initialize(headless=True)
    camera = Camera(machine)
    env = CustomEnv(machine, camera, state="vision")
    model = SAC(CnnPolicy, env, verbose=1, learning_starts=32, batch_size=32, \
                target_update_interval=32, tensorboard_log=dir_path+'/Logs/')
    model.learn(total_timesteps=2000, log_interval=1000000)
    model.save("Grasp_Model_Full_Pose")
Example #15
def test_predict_SAC():
    '''
    Visualize predictions from a random policy.
    '''
    env = gym.make('KukaMujocoSAC-v0')
    model = SAC(SAC_MlpPolicy, env)
    obs = env.reset()
    while True:
        action, _ = model.predict(obs)
        obs, rew, done, info = env.step(action, render=True)
Example #16
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None):
    env = gym.make(env_id)

    model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)
    model.learn(total_timesteps=timesteps, log_interval=log_interval)

    save_model_weights(model, "sac", env_id, policy, seed)
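A minimal call sketch, assuming save_model_weights is importable from the surrounding project; the environment id and argument values are illustrative only:

sac('Pendulum-v0', timesteps=10000, log_interval=10, tensorboard_log='./tb', seed=0)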
Example #17
def train():
    set_gpu()
    expDir = '/home/shivanik/lab/pointExp/state/'
    num_objs = 1

    verbose = 1
    name = 'sac_%d_0.5' % num_objs
    nIter = 1e8

    save_video_length = 200
    save_video_interval = 1000000
    file = open('sac_done.txt', 'w+')
    env = make_vec_env(
        'PointMassDense-%d-v1' % num_objs,
        1,
        wrapper_class=FlattenDictWrapper,
        wrapper_env_kwargs=['observation', 'achieved_goal', 'desired_goal'])
    n_actions = env.action_space.shape[-1]
    stddev = 0.2

    pool = multiprocessing.Pool(processes=4)
    for lr in [1e-5]:  #, 5e-4, 1e-5
        logger = osp.join(
            expDir, name, 'logs%s_%s' % (np.format_float_scientific(nIter),
                                         np.format_float_scientific(lr)))
        env = VecVideoRecorder(
            env,
            osp.join(logger, "videos"),
            record_video_trigger=lambda x: x % save_video_interval == 0,
            video_length=save_video_length)
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))

        # boo = pool.apply_async(func_run, args=(env, logger, lr, action_noise, file))
        model = SAC(
            MlpPolicy,
            env,
            verbose=verbose,
            tensorboard_log=logger,
            learning_rate=lr,
            action_noise=action_noise,
        )
        model.learn(total_timesteps=int(nIter), log_interval=100)
        exp_name = expDir + "/%s/%s_%s" % (name,
                                           np.format_float_scientific(nIter),
                                           np.format_float_scientific(lr))
        model.save(exp_name)
        file.write(exp_name + '\n')
        env.close()
    file.close()
    pool.close()
    pool.join()
Example #18
def sac(env, seed):
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.1) *
                                                np.ones(n_actions))

    return SAC('MlpPolicy',
               env,
               learning_rate=0.001,
               action_noise=action_noise,
               verbose=1,
               tensorboard_log="./data/runs",
               seed=seed)
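A minimal usage sketch for the factory above; the environment id, step count, and save path are assumptions for illustration:

import gym

env = gym.make('Pendulum-v0')  # any continuous-action Gym env
model = sac(env, seed=0)
model.learn(total_timesteps=10000)
model.save('./data/sac_pendulum')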
Example #19
    def run(self):
        self._init()

        env = self.env
        model = self.model
        objective = self.objective

        if objective == "infogain":
            wenv = InfogainEnv(env, model)
        elif objective == "prederr":
            wenv = PrederrEnv(env, model)
        else:
            raise AttributeError(
                "Objective '{}' is unknown. Needs to be 'infogain' or 'prederr'"
                .format(objective))

        wenv.max_episode_len = self.horizon
        wenv.end_episode_callback = self._end_episode
        dvenv = DummyVecEnv([lambda: wenv])

        if self.rl_algo == "ddpg":
            self.logger.info("Setting up DDPG as model-free RL algorithm.")
            pn = AdaptiveParamNoiseSpec()
            an = NormalActionNoise(np.array([0]), np.array([1]))
            rl_model = DDPG(DDPGMlpPolicy,
                            dvenv,
                            verbose=1,
                            render=False,
                            action_noise=an,
                            param_noise=pn,
                            nb_rollout_steps=self.horizon,
                            nb_train_steps=self.horizon)
        elif self.rl_algo == "sac":
            self.logger.info("Setting up SAC as model-free RL algorithm.")
            rl_model = SAC(SACMlpPolicy,
                           dvenv,
                           verbose=1,
                           learning_starts=self.horizon)
        else:
            raise AttributeError(
                "Model-free RL algorithm '{}' is unknown.".format(
                    self.rl_algo))

        # Train the agent
        max_steps_total = self.horizon * self.n_episodes * 100
        try:
            self.logger.info("Start the agent")
            rl_model.learn(total_timesteps=max_steps_total, seed=self.seed)
        except MaxEpisodesReachedException:
            print("Exploration finished.")
Example #20
def main():
    env = gym.make("teaching-env-v0",
                   teacher_path=os.path.join(os.getcwd(), "../saved_models",
                                             sys.argv[1]),
                   validation_path=DATA_PATH,
                   max_queries=config.MAX_QUERIES)
    agent_model = SAC(MlpPolicy,
                      env,
                      train_freq=1,
                      batch_size=64,
                      learning_rate=3e-4,
                      learning_starts=0,
                      buffer_size=1000,
                      random_exploration=config.EPSILON_EXPLORATION,
                      gamma=config.GAMMA,
                      verbose=1)
    #agent_model.learn(total_timesteps=config.MAX_QUERIES * config.NUM_TRAIN_EPISODES)
    #agent_model.save('test_SAC')

    # SAC.load is a classmethod returning a new model; assign the result so the
    # loaded weights are actually used for prediction below
    agent_model = SAC.load('test_SAC', env=env)

    obs = env.reset()

    total_reward = float('-inf')
    prog = tqdm(range(config.MAX_QUERIES), postfix={'Reward': total_reward})

    actions = []  # For visualization
    total_reward = 0.0
    for i in prog:
        action = select_action(agent_model,
                               obs,
                               epsilon=config.EPSILON_EXPLORATION)
        #action, _states = agent_model.predict(obs, deterministic=False)
        obs, reward, done, info = env.step(action)
        total_reward += reward
        prog.set_postfix({'Reward': total_reward})
        actions.append(np.asscalar(action))
    plt.hist(actions, bins=config.NUM_BINS, range=(-5, 5), density=True)
    plt.savefig('./visualizations/histograms/SAC')
    plt.clf()

    # Plot student's predicted function
    inputs = np.linspace(-5, 5, num=1000)
    outputs = env.student_model(inputs.reshape(-1, 1))
    plt.scatter(inputs, outputs, s=0.1, label='SAC')
    plt.title("SAC Student's Approximation")
    plt.ylim((-60, 100))
    plt.savefig('./visualizations/functions/SAC')
    plt.clf()
Example #21
def train_GAIL(env_train, model_name, timesteps=1000):
    """GAIL Model"""
    #from stable_baselines.gail import ExpertDataset, generate_expert_traj
    start = time.time()
    # generate expert trajectories
    model = SAC('MlpPolicy', env_train, verbose=1)
    generate_expert_traj(model, 'expert_model_gail', n_timesteps=100, n_episodes=10)

    # Load dataset
    dataset = ExpertDataset(expert_path='expert_model_gail.npz', traj_limitation=10, verbose=1)
    model = GAIL('MlpPolicy', env_train, dataset, verbose=1)

    model.learn(total_timesteps=1000)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (GAIL): ', (end - start) / 60, ' minutes')
    return model
Example #22
def train_SAC(env, out_dir, seed=None, **kwargs):

    # Logs will be saved in log_dir/monitor.csv
    global output_dir
    output_dir = out_dir
    log_dir = os.path.join(out_dir, 'log')
    os.makedirs(log_dir, exist_ok=True)
    env = gym.make(env)
    env = Monitor(env, log_dir + '/', allow_early_resets=True)

    # Delete keys so the dict can be passed to the model constructor
    # policy = kwargs['policy']
    policy = 'MlpPolicy'
    # n_timesteps = kwargs['n_timesteps']
    n_timesteps = int(1e6)
    noise_type = None
    # Add some param noise for exploration
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1,
                                         desired_action_stddev=0.1)

    continue_model = False
    if continue_model is True:
        # Continue training
        print("Loading pretrained agent")
        model = SAC.load(os.path.join(out_dir, 'final_model.pkl'),
                         env=env,
                         tensorboard_log=os.path.join(log_dir, 'tb'),
                         verbose=1,
                         **kwargs)
    else:
        model = SAC(
            policy,
            env,  # action_noise=param_noise,
            verbose=1,
            tensorboard_log=os.path.join(log_dir, 'tb'),
            full_tensorboard_log=False,
            **kwargs)

    model.learn(total_timesteps=n_timesteps,
                seed=seed,
                callback=callback,
                log_interval=10)

    return model
Example #23
def sac(env_id,
        timesteps,
        policy="MlpPolicy",
        log_interval=None,
        tensorboard_log=None,
        seed=None,
        load_weights=None):
    env = gym.make(env_id)

    if load_weights is not None:
        model = SAC.load(load_weights, env, verbose=0)
    else:
        model = SAC(policy, env, verbose=1, tensorboard_log=tensorboard_log)

    callback = WandbRenderEnvCallback(model_name="sac", env_name=env_id)

    model.learn(total_timesteps=timesteps,
                log_interval=log_interval,
                callback=callback)
Example #24
def train_SAC(env_train, model_name, timesteps=100000):

    # train SAC model
    os.chdir("./model_saved/")
    start = time.time()
    print("Train SAC Model with MlpPolicy: ")

    model = SAC('MlpPolicy', env_train, verbose=0)
    print("SAC Learning time steps: ", timesteps)
    model.learn(total_timesteps=timesteps)
    print("SAC Model learning completed: ")

    end = time.time()
    timestamp = time.strftime('%b-%d-%Y_%H%M')
    model_file_name = (model_name + timestamp)
    model.save(model_file_name)
    print("SAC Model save finish     :")
    print('Training time SAC: ', (end - start) / 60, ' minutes')
    os.chdir("./..")

    return model
Example #25
def run(env_name, algorithm, seed):
    env_name_map = {
        'halfcheetah': 'HalfCheetah-v2',
        'hopper': 'Hopper-v2',
        'ant': 'Ant-v2',
        'walker': 'Walker2d-v2'
    }
    env = DummyVecEnv([lambda: gym.make(env_name_map[env_name])])

    if algorithm == 'ppo':
        model = PPO2('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    elif algorithm == 'trpo':
        model = TRPO('MlpPolicy', env, max_kl=0.01, verbose=1)
    elif algorithm == 'sac':
        model = SAC('MlpPolicy', env, learning_rate=1e-3, verbose=1)
    else:
        raise NotImplementedError()

    filepath = '%s_%s_%d.pkl' % (env_name, algorithm, seed)
    model.learn(total_timesteps=100000, seed=seed)
    model.save(filepath)
Example #26
 def train(self, num_e=1, n_timesteps=10000000, save_fraction=0.1, save='saves/m1'):
     env_id = "default"
     num_e = 32  # Number of processes to use
     # Create the vectorized environment
     #env = DummyVecEnv([lambda: env])
     #Ramona
     #self.env = SubprocVecEnv([self.make_env(env_id, i) for i in range(num_e)])
     env = Template_Gym()
     self.env = DummyVecEnv([lambda: env])
     self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
     #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-5, tensorboard_log="./test6" )
     
     self.model = SAC(CustomPolicy_sac, self.env, verbose=1, learning_rate=1e-5, tensorboard_log="./m1lstm1")
     #self.model = PPO2.load("default9", self.env, policy=CustomPolicy, tensorboard_log="./test/" )
     n_timesteps = n_timesteps * save_fraction
     n_timesteps = int(n_timesteps)
     training_loop = 1 / save_fraction
     training_loop = int(training_loop)
     
     for i in range(training_loop):
         self.model.learn(n_timesteps)
         self.model.save(save+str(i))
Example #27
def get_SAC_model(model_settings, model_path, ckpt_path, ckpt_step, tb_path):
    policy_kwargs = dict(layers=model_settings['NET_LAYERS'])
    env = get_single_process_env(model_settings, model_path, ckpt_step)
    if ckpt_path is not None:
        print("Loading model from checkpoint '{}'".format(ckpt_path))
        model = SAC.load(ckpt_path,
                         env=env,
                         _init_setup_model=True,
                         policy_kwargs=policy_kwargs,
                         **model_settings['train_configs'],
                         verbose=1,
                         tensorboard_log=tb_path)
        model.num_timesteps = ckpt_step
    else:
        model = SAC(SACMlpPolicy,
                    env,
                    _init_setup_model=True,
                    policy_kwargs=policy_kwargs,
                    **model_settings['train_configs'],
                    verbose=1,
                    tensorboard_log=tb_path)
    return model, env
Example #28
    def __init__(self):
        logger.info(os.getcwd())
        #self._interpreter = Interpreter("./converted_model.tflite")
        #self._interpreter.allocate_tensors()
        #print(self._interpreter.get_input_details())
        #print(self._interpreter.get_output_details())
        #_, self._input_height, self._input_width, _ = self._interpreter.get_input_details()[0]['shape']

        self.env = AutoDriftEnv(const_throttle=0.3)
        # self.model = SacModel(policy=CnnPolicy, env=self.env)
        self.model = SAC(policy=CnnPolicy, env=self.env)

        # self._input_height = IMAGE_HEIGHT
        # self._input_width = IMAGE_WIDTH
        # print(self._input_height)
        # print(self._input_width)

        # self._socket = socket.socket()
        # socket_addr = ('127.0.0.1', 8888)
        # UNCOMMENT THIS
        #self._socket.connect(socket_addr)

        self.main()
Example #29
    def train_SAC(self, model_name, model_params=config.SAC_PARAMS):
        """TD3 model"""
        from stable_baselines import SAC

        env_train = self.env

        start = time.time()
        model = SAC(
            'MlpPolicy',
            env_train,
            batch_size=model_params['batch_size'],
            buffer_size=model_params['buffer_size'],
            learning_rate=model_params['learning_rate'],
            learning_starts=model_params['learning_starts'],
            ent_coef=model_params['ent_coef'],
            verbose=model_params['verbose'],
            tensorboard_log=f"{config.TENSORBOARD_LOG_DIR}/{model_name}")
        model.learn(total_timesteps=model_params['timesteps'],
                    tb_log_name="SAC_run")
        end = time.time()

        model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
        print('Training time (SAC): ', (end - start) / 60, ' minutes')
        return model
Example #30
def get_sac(env, **kwargs):
    env_id = env.unwrapped.spec.id
    if (env_id.startswith("Ant") or env_id.startswith("HalfCheetah")
            or env_id.startswith("Swimmer") or env_id.startswith("Fetch")):
        sac_kwargs = {
            "verbose": 1,
            "learning_rate": 3e-4,
            "gamma": 0.98,
            "tau": 0.01,
            "ent_coef": "auto",
            "buffer_size": 1000000,
            "batch_size": 256,
            "learning_starts": 10000,
            "train_freq": 1,
            "gradient_steps": 1,
        }
        policy = CustomSACPolicy
    elif env_id.startswith("Hopper"):
        sac_kwargs = {
            "verbose": 1,
            "learning_rate": 3e-4,
            "ent_coef": 0.01,
            "buffer_size": 1000000,
            "batch_size": 256,
            "learning_starts": 1000,
            "train_freq": 1,
            "gradient_steps": 1,
        }
        policy = CustomSACPolicy
    else:
        sac_kwargs = {"verbose": 1, "learning_starts": 1000}
        policy = MlpPolicySac
    for key, val in kwargs.items():
        sac_kwargs[key] = val
    solver = SAC(policy, env, **sac_kwargs)
    return solver
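A minimal usage sketch for get_sac above; the environment id and the override shown are assumptions for illustration:

import gym

env = gym.make('HalfCheetah-v2')           # id matches the first branch above
model = get_sac(env, learning_rate=1e-4)   # extra kwargs override the per-env defaults
model.learn(total_timesteps=100000)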