Example #1
def main():
    # Create an asynchronous simulation of the InvertedDoublePendulum-v2 MuJoCo environment.
    env = DoubleInvertedPendulumEnv(
        agent_dt=0.005,
        sensor_dt=[0.01, 0.0033333],
    )
    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_returns,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines PPO
    learn(
        env,
        policy_fn,
        max_timesteps=1e6,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.0001,
        optim_batchsize=64,
        gamma=0.995,
        lam=0.995,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
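
Every example here builds its learn callback with create_callback(shared_returns), a helper defined alongside these scripts. A minimal sketch of what such a factory could look like follows, assuming the Baselines learner calls the callback as callback(locals_, globals_) once per iteration and that the current rollout segment exposes 'ep_rets' and 'ep_lens'; the key names and structure are illustrative, not the exact SenseAct implementation.

# Illustrative sketch of a callback factory like create_callback; it assumes
# the learner invokes callback(locals_, globals_) once per iteration and that
# the current rollout segment (locals_['seg']) carries 'ep_rets'/'ep_lens'.
def create_callback(shared_returns):
    def kindred_callback(locals_, globals_):
        seg = locals_.get('seg')
        if seg is None:  # first iteration: no rollout collected yet
            return
        # Copy finished-episode statistics into the Manager dict so the
        # plotting process can read a consistent snapshot.
        shared_returns['write_lock'] = True
        returns = shared_returns['episodic_returns']
        lengths = shared_returns['episodic_lengths']
        returns.extend(seg.get('ep_rets', []))
        lengths.extend(seg.get('ep_lens', []))
        shared_returns['episodic_returns'] = returns
        shared_returns['episodic_lengths'] = lengths
        shared_returns['write_lock'] = False
    return kindred_callback
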
Example #2
def train(cfg):
    """ Function to start training and logging processes

    Args:
        cfg (dict): Configuration parameters loaded into dict from yaml file
    """

    artifact_path = cfg['train']['artifact_path']

    # Get environment
    m = cfg['environment']['module']
    t = imp.load_module(m, *imp.find_module(m))
    get_env = getattr(t, cfg['environment']['class'])
    env, policy_fn = get_env(cfg)

    # Create and start logging process
    log_running = Value('i', 1)
    # Manager to share data between log process and main process
    shared_returns = Manager().dict({
        'write_lock': False,
        'episodic_returns': [],
        'episodic_lengths': [],
    })

    # Spawn logging process
    pp = Process(
        target=log_function,
        args=(env, cfg['algorithm']['hyperparameters']['timesteps_per_batch'],
              shared_returns, log_running, artifact_path))
    pp.start()

    # Create callback function for logging data from learn
    kindred_callback = create_callback(shared_returns)

    # Train
    m = importlib.import_module(cfg['algorithm']['codebase']['module'])
    learn = getattr(m, cfg['algorithm']['codebase']['class'])
    logging.debug("Learn function: {}".format(learn))

    hyperparams = cfg['algorithm']['hyperparameters']
    learn(env,
          policy_fn,
          max_timesteps=hyperparams['max_timesteps'],
          timesteps_per_batch=hyperparams['timesteps_per_batch'],
          max_kl=hyperparams['max_kl'],
          cg_iters=hyperparams['cg_iters'],
          cg_damping=hyperparams['cg_damping'],
          vf_iters=hyperparams['vf_iters'],
          vf_stepsize=hyperparams['vf_stepsize'],
          gamma=hyperparams['gamma'],
          lam=hyperparams['lam'],
          callback=kindred_callback)

    # Safely terminate logging process
    log_running.value = 0  # shut down logging process
    time.sleep(2)
    pp.join()

    env.close()
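
For reference, after the YAML file is loaded, the cfg dict consumed by train could look roughly like this. The nesting mirrors the keys accessed above; module names and values are placeholders, and the hyperparameter names match the TRPO-style learn call in this example.

# Illustrative shape of the loaded configuration; values are placeholders.
cfg = {
    'train': {
        'artifact_path': '/tmp/artifacts/',
    },
    'environment': {
        'module': 'my_env_module',   # module importable via imp.find_module
        'class': 'get_env',          # factory returning (env, policy_fn)
    },
    'algorithm': {
        'codebase': {
            'module': 'baselines.trpo_mpi.trpo_mpi',  # module containing learn
            'class': 'learn',
        },
        'hyperparameters': {
            'max_timesteps': 200000,
            'timesteps_per_batch': 2048,
            'max_kl': 0.05,
            'cg_iters': 10,
            'cg_damping': 0.1,
            'vf_iters': 5,
            'vf_stepsize': 0.001,
            'gamma': 0.995,
            'lam': 0.995,
        },
    },
}
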
Example #3
def main():
    # Create the Create2 mover environment
    env = Create2MoverEnv(90, port='/dev/ttyUSB0', obs_history=1, dt=0.15)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
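
The plotting targets used in these examples (plot_returns, plot_create2_mover, plot_ur5_reacher, ...) receive the environment, a batch size, the shared_returns dict, and the plot_running flag, and presumably poll shared_returns until the flag is cleared. The sketch below only illustrates that polling pattern and the write_lock convention; it is not the SenseAct plotting code.

import time
import copy

# Hypothetical monitoring loop with the same signature as the plotting targets
# used above; replace the print with matplotlib plotting as needed.
def plot_shared_returns(env, batch_size, shared_returns, plot_running):
    while plot_running.value:
        if not shared_returns['write_lock']:  # respect the simple lock flag
            returns = copy.deepcopy(shared_returns['episodic_returns'])
            if returns:
                recent = returns[-10:]
                print("episodes: {}, mean return (last {}): {:.2f}".format(
                    len(returns), len(recent), sum(recent) / len(recent)))
        time.sleep(1.0)
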
Example #4
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create UR5 Reacher environment (6 DoF)
    env = ReacherEnv(setup="UR5_6dof",
                     host=None,
                     dof=6,
                     control_type="velocity",
                     target_type="position",
                     reset_type="zero",
                     reward_type="precision",
                     derivative_type="none",
                     deriv_action_max=5,
                     first_deriv_max=2,
                     accel_max=1.4,
                     speed_max=0.3,
                     speedj_a=1.4,
                     episode_length_time=4.0,
                     episode_length_step=None,
                     actuation_sync_period=1,
                     dt=0.04,
                     run_mode="multiprocess",
                     rllab_box=False,
                     movej_t=2.0,
                     delay=0.0,
                     random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #5
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create DXL Reacher1D environment
    env = DxlReacher1DEnv(setup='dxl_gripper_default',
                          idn=1,
                          baudrate=1000000,
                          obs_history=1,
                          dt=0.04,
                          gripper_dt=0.01,
                          rllab_box=False,
                          episode_length_step=None,
                          episode_length_time=2,
                          max_torque_mag=100,
                          control_type='torque',
                          target_type='position',
                          reset_type='zero',
                          reward_type='linear',
                          use_ctypes_driver=True,
                          random_state=rand_state)

    # The outputs of the policy function are sampled from a Gaussian. However, the actions in terms of
    # torque commands are in the range [-max_torque_mag, max_torque_mag]. The NormalizedEnv wrapper scales
    # actions accordingly (a standalone sketch of this rescaling follows the example). By default, it does
    # not normalize observations or rewards.
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=32,
                         num_hid_layers=2)

    # create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Plotting process
    pp = Process(target=plot_dxl_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(
        env,
        policy_fn,
        max_timesteps=50000,
        timesteps_per_batch=2048,
        max_kl=0.05,
        cg_iters=10,
        cg_damping=0.1,
        vf_iters=5,
        vf_stepsize=0.001,
        gamma=0.995,
        lam=0.995,
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    # Shutdown the environment
    env.close()
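
The NormalizedEnv comment in the example above describes mapping unit-scale Gaussian policy outputs onto the environment's torque range. The following standalone snippet illustrates that kind of linear rescaling under the stated assumption; it is not the SenseAct NormalizedEnv implementation.

import numpy as np

# Illustrative linear rescaling of a policy action from [-1, 1] onto
# [low, high], e.g. [-max_torque_mag, max_torque_mag]. This mirrors the idea
# described in the NormalizedEnv comment but is not the actual wrapper code.
def rescale_action(action, low, high):
    action = np.clip(action, -1.0, 1.0)
    return low + (action + 1.0) * 0.5 * (high - low)

# Example: with max_torque_mag=100, rescale_action(0.5, -100, 100) -> 50.0
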
Example #6
def main():
    # optionally use a pretrained model
    load_model_data = None
    hidden_sizes = (32, 32)
    if len(sys.argv) > 1:
        load_model_path = sys.argv[1]
        load_model_data = pkl.load(open(load_model_path, 'rb'))
        hidden_sizes = load_model_data['hidden_sizes']

    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 mover environment
    env = Create2MoverEnv(90,
                          port='/dev/ttyUSB0',
                          obs_history=1,
                          dt=0.15,
                          random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_create2_mover,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns, load_model_data)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=40000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
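
Example #6 warm-starts from a pickle file whose contents include at least a 'hidden_sizes' entry; anything else the callback restores (such as network weights) is not shown in the example. A hedged sketch of writing such a file is below; the 'weights' key is purely hypothetical.

import pickle as pkl

# Sketch of producing a model file the loader above could read. Only
# 'hidden_sizes' is read directly in the example; any extra entries
# (e.g. weights consumed inside create_callback) are assumptions here.
model_data = {
    'hidden_sizes': (32, 32),
    # 'weights': ...,  # hypothetical: whatever create_callback expects
}
with open('trained_model.pkl', 'wb') as f:
    pkl.dump(model_data, f)
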
Example #7
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)

    # Create UR10 Reacher2D environment
    env = ReacherEnvWithRealSense(setup="UR10_default",
                                  camera_hosts=('localhost', ),
                                  camera_ports=(5000, ),
                                  camera_res=(3, 480, 640),
                                  host=None,
                                  dof=2,
                                  control_type="velocity",
                                  target_type="position",
                                  reset_type="zero",
                                  reward_type="precision",
                                  derivative_type="none",
                                  deriv_action_max=5,
                                  first_deriv_max=2,
                                  accel_max=1.4,
                                  speed_max=1.0,
                                  speedj_a=1.4,
                                  episode_length_time=4.0,
                                  episode_length_step=None,
                                  actuation_sync_period=1,
                                  dt=0.5,
                                  run_mode="multiprocess",
                                  rllab_box=False,
                                  movej_t=2.0,
                                  delay=0.0,
                                  random_state=rand_state)
    env = NormalizedEnv(env)
    # Start environment processes
    env.start()

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur10_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data (not used by the rollout below)
    kindred_callback = create_callback(shared_returns)

    # Run a short random-action rollout (no TRPO training in this example)
    for episode in range(10):
        print(f"Episode: {episode + 1}")
        done = False
        timestep = 0
        curr_obs = env.reset()
        while not done:
            if timestep % 3 == 0:
                action = np.random.normal(scale=0.1, size=(2, ))
            print(action)
            next_obs, reward, done, _ = env.step(action)

            timestep += 1
            curr_obs = next_obs

            if timestep == 15:
                done = True

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #8
def main():
    # use fixed random state
    rand_state = np.random.RandomState(1).get_state()
    np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create Sawyer reach environment
    env = SawyerReachXYZEnv(target_goal=(0, 0, 0),
                            indicator_threshold=.05,
                            reward_type='hand_distance',
                            action_mode='torque',
                            use_safety_box=True,
                            torque_action_scale=1)

    env = NormalizedEnv(env)
    # Start environment processes
    env.start()
    # Create baselines TRPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=64,
                         num_hid_layers=2)

    # Create and start plotting process
    plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
    })
    # Spawn plotting process
    pp = Process(target=plot_ur5_reacher,
                 args=(env, 2048, shared_returns, plot_running))
    pp.start()

    # Create callback function for logging data from baselines TRPO learn
    kindred_callback = create_callback(shared_returns)

    # Train baselines TRPO
    learn(env,
          policy_fn,
          max_timesteps=200000,
          timesteps_per_batch=2048,
          max_kl=0.05,
          cg_iters=10,
          cg_damping=0.1,
          vf_iters=5,
          vf_stepsize=0.001,
          gamma=0.995,
          lam=0.995,
          callback=kindred_callback)

    # Safely terminate plotter process
    plot_running.value = 0  # shut down plotting process
    time.sleep(2)
    pp.join()

    env.close()
Example #9
def main():
    # optionally use a pretrained model
    save_model_path = None
    load_model_path = None
    load_trained_model = False
    hidden_sizes = (64, 64, 64)

    if len(sys.argv) > 2:  # load model
        load_trained_model = True

    save_model_path = sys.argv[1]  # e.g. saved/uniform/X/Y/Z/
    os.makedirs(save_model_path, exist_ok=True)
    run_dirs = os.listdir(save_model_path)
    save_model_basepath = save_model_path + 'run_' + str(len(run_dirs) + 1) + '/'
    os.makedirs(save_model_basepath, exist_ok=True)
    os.makedirs(save_model_basepath + 'models', exist_ok=True)
    os.makedirs(save_model_basepath + 'data', exist_ok=True)

    if load_trained_model:  # load an existing model
        load_model_path = sys.argv[2]  # e.g. saved/uniform/X/Y/Z/run_1/model*

    # use fixed random state
    #rand_state = np.random.RandomState(1).get_state()
    #np.random.set_state(rand_state)
    tf_set_seeds(np.random.randint(1, 2**31 - 1))

    # Create the Create2 docker environment
    # distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.575, 0.425, 0, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.25, 0.2, 0.55, 0, 0, 0, 0, 0, 0])
    # distro = np.array([0.1, 0.1, 0.25, 0.55, 0, 0, 0, 0, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0, 0, 0.475, 0])
    # FAILED: distro = np.array([0.05, 0.05, 0.15, 0.275, 0, 0.475, 0, 0, 0])
    # FAILED: distro = np.array([0.10, 0.05, 0.10, 0.35, 0, 0.4, 0, 0, 0])
    #distro = np.array([0.10, 0.05, 0.10, 0.375, 0.375, 0, 0, 0, 0]) #run 1
    # distro = np.array([0.06, 0.03, 0.06, 0.425, 0.425, 0, 0, 0, 0]) # run2
    # distro = np.array([0.025, 0.05, 0.05, 0.25, 0.25, 0.375, 0, 0, 0]) # part 1, first 100 episodes
    # OK: distro = np.array([0.05, 0.025, 0.05, 0.225, 0.225, 0.425, 0, 0, 0])
    #distro = np.array([0.025, 0.02, 0.025, 0.1375, 0.1375, 0.3275, 0.3275, 0, 0])
    # FAILED: distro = np.array([0.015, 0.015, 0.02, 0.06, 0.09, 0.35, 0.45, 0, 0])
    distro = np.array([0, 1, 0, 0, 0, 0, 0, 0, 0]) 

    env = Create2DockerEnv(30, distro,
                           port='/dev/ttyUSB0', ir_window=20,
                           ir_history=1,
                           obs_history=1, dt=0.045)
                           #random_state=rand_state)
    env = NormalizedEnv(env)

    # Start environment processes
    env.start()

    # Create baselines PPO policy function
    sess = U.single_threaded_session()
    sess.__enter__()

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=ob_space,
                         ac_space=ac_space,
                         hid_size=hidden_sizes[0],
                         num_hid_layers=len(hidden_sizes))

    # Create and start plotting process
    #plot_running = Value('i', 1)
    shared_returns = Manager().dict({
        "write_lock": False,
        "episodic_returns": [],
        "episodic_lengths": [],
        "episodic_ss": [],
    })
    # Spawn plotting process
    #pp = Process(target=plot_create2_docker, args=(env, 2048, shared_returns, plot_running))
    #pp.start()

    # Create callback function for logging data from baselines PPO learn
    kindred_callback = create_callback(shared_returns, save_model_basepath, load_model_path)

    # Train baselines PPO
    model = learn(
        env,
        policy_fn,
        max_timesteps=100000,
        timesteps_per_actorbatch=675,  # previously 512
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=0.00005,
        optim_batchsize=16,
        gamma=0.96836,
        lam=0.99944,
        schedule="linear",
        callback=kindred_callback,
    )

    # Safely terminate plotter process
    #plot_running.value = 0  # shut down plotting process
    #time.sleep(2)
    #pp.join()

    env.close()