Example #1
def main(args):
    """
    Train and save the SAC model for the HalfCheetah problem.

    :param args: (ArgumentParser) the input arguments
    """
    env = gym.make(args.env)
    test_env = gym.make(args.env)

    if args.ent_coef is None:
        args.ent_coef = 'auto'

    model = SAC(env=env,
                test_env=test_env,
                seed=int(args.seed),
                ent_coef=args.ent_coef,
                reward_scale=5.)
    ep_rewards = model.learn(total_timesteps=int(args.max_timesteps),
                             save_path=args.save_path)

    model.save(args.save_path + "/%s_model_seed%d_fin_auto.zip" %
               (args.env, int(args.seed)))
    np.save(
        args.save_path + "/%s_rews_seed%d_fin_auto.npy" %
        (args.env, int(args.seed)), np.array(ep_rewards))

    # print("Saving model to halfcheetah_model.zip")
    # model.learn(total_timesteps=100)
    # model.load("halfcheetah_model.zip")

    model.evaluate(10)
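A minimal sketch of how this entry point might be wired up with argparse; the flag names and defaults below are assumptions inferred from how args is used in main() above, not settings from the source project.

import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='HalfCheetah-v2')      # gym environment id (assumed default)
    parser.add_argument('--seed', default=0)
    parser.add_argument('--ent_coef', default=None)             # None becomes 'auto' inside main()
    parser.add_argument('--max_timesteps', default=1000000)
    parser.add_argument('--save_path', default='./models')
    main(parser.parse_args())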
Example #2
    def __init__(self):
        self.observation_reward = rospy.Subscriber("/rl/environment_response", reward_observation, self.policy, queue_size=10)
        self.act_pub = rospy.Publisher("/rl/final_action", action_agent, queue_size=10)
        self.prev_state = None
        self.state = None
        self.reward = None
        self.final_state = None
        self.agent = SAC()
Example #3
File: main.py Project: ColorlessBoy/SAC
def run(args):
    env = gym.make(args.env)

    device = torch.device(args.device)

    # 1. Set some necessary seed.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    random.seed(args.seed)
    env.seed(args.seed)

    # 2. Create nets.
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]
    hidden_sizes = (256, 256)
    ac = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    ac_target = ActorCritic(state_size, action_size, hidden_sizes).to(device)
    hard_update(ac, ac_target)

    # env_sampler = EnvSampler(env, max_episode_step=4000, capacity=1e6)
    env_sampler = EnvSampler2(env, gamma=args.gamma1, capacity=1e6)

    alg = SAC(ac,
              ac_target,
              gamma=args.gamma2,
              alpha=0.2,
              q_lr=1e-3,
              pi_lr=1e-3,
              target_lr=5e-3,
              device=device)

    def get_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state)

    def get_mean_action(state):
        state = torch.FloatTensor(state).unsqueeze(0).to(device)
        return ac_target.get_action(state, deterministic=True)

    start_time = time()
    for _ in range(args.start_steps):
        env_sampler.addSample()
    print("Warmup uses {}s.".format(time() - start_time))

    for step in range(1, args.total_steps + 1):
        env_sampler.addSample(get_action)

        if step % args.update_every == 0:
            for _ in range(args.update_every):
                batch = env_sampler.sample(args.batch_size)
                losses = alg.update(*batch)

        if step % args.test_every == 0:
            test_reward = env_sampler.test(get_mean_action)
            yield (step, test_reward, *losses)

    torch.save(ac.pi.state_dict(), './env_{}_pi_net.pth.tar'.format(args.env))
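A minimal sketch of a caller consuming the run() generator above; the Namespace fields are assumptions taken from the attributes accessed inside run(), and the values are placeholders.

from argparse import Namespace

args = Namespace(env='HalfCheetah-v2', device='cpu', seed=0,
                 gamma1=0.99, gamma2=0.99, start_steps=10000,
                 total_steps=1000000, update_every=50,
                 test_every=4000, batch_size=256)
for step, test_reward, *losses in run(args):
    print(step, test_reward)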
Example #4
    def ros_init(self):
        if self.team == 'A':
            self.agent = SAC(act_dim=2, obs_dim=6,
                             lr_actor=l_rate*(1e-3), lr_value=l_rate*(1e-3),
                             gamma=0.99, tau=0.995)
    
            rospy.init_node('strategy_node_A', anonymous=True)
            # self.A_info_pub = rospy.Publisher('/nubot1/A_info', Float32MultiArray, queue_size=1) # 3in1
            self.vel_pub    = rospy.Publisher('/nubot1/nubotcontrol/velcmd', VelCmd, queue_size=1)
            self.reset_pub = rospy.Publisher('/gazebo/set_model_state', ModelState, queue_size=10)
            # self.ready2restart_pub  = rospy.Publisher('nubot1/ready2restart',Bool, queue_size=1)
            rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
            rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
            # rospy.Subscriber('/coach/state', String, self.state_callback)
            # rospy.Subscriber('/coach/reward', Float32, self.reward_callback)
            # rospy.Subscriber('/coach/done', Bool, self.done_callback)
            # rospy.Subscriber('coach/HowEnd', Int16, self.HowEnd_callback)
            # rospy.Subscriber("/rival1/steal", Bool, self.steal_callback)

            rospy.wait_for_service('/nubot1/Shoot')
            self.call_Shoot = rospy.ServiceProxy('/nubot1/Shoot', Shoot)
            
            # rospy.wait_for_service('/gazebo/reset_simulation')
            # self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty, persistent=True)

            # rospy.wait_for_service('/gazebo/set_model_state')
            # self.call_set_modol = rospy.ServiceProxy('/gazebo/set_model_state', SetModelState)
            rospy.wait_for_service('/nubot1/BallHandle')
            self.call_Handle = rospy.ServiceProxy('/nubot1/BallHandle', BallHandle)
            rospy.wait_for_service('/rival1/BallHandle')
            self.call_B_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)

            

        elif self.team == 'B':
            rospy.init_node('strategy_node_B', anonymous=True)
            self.vel_pub   = rospy.Publisher('/rival1/nubotcontrol/velcmd', VelCmd, queue_size=1)
            self.steal_pub = rospy.Publisher('/rival1/steal', Bool, queue_size=1) # steal
            rospy.Subscriber("/rival1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
            rospy.Subscriber("/rival1/omnivision/OmniVisionInfo/GoalInfo", PPoint, self.GoalInfo)
            
            rospy.wait_for_service('/rival1/BallHandle')
            self.call_Handle = rospy.ServiceProxy('/rival1/BallHandle', BallHandle)


        else :
            rospy.init_node('coach', anonymous=True)
            self.state_pub  = rospy.Publisher('/coach/state', String, queue_size=1)
            self.reward_pub = rospy.Publisher('/coach/reward', Float32, queue_size=1)
            self.done_pub   = rospy.Publisher('coach/done', Bool, queue_size=1)
            self.HowEnd_pub = rospy.Publisher('coach/HowEnd', Int16, queue_size=1)
            rospy.Subscriber("/nubot1/omnivision/OmniVisionInfo", OminiVisionInfo, self.callback)
            rospy.Subscriber("/rival1/steal", Bool, self.steal_callback) # steal
            rospy.Subscriber("/nubot1/A_info", Float32MultiArray, self.A_info_callback)
            # rospy.Subscriber('gazebo/model_states', ModelStates, self.fly_callback)
            rospy.Subscriber('nubot1/ready2restart',Bool , self.ready2restart_callback)
            rospy.wait_for_service('/gazebo/reset_simulation')
            self.call_restart = rospy.ServiceProxy('/gazebo/reset_simulation', Empty)
Example #5
def main(args):
    env = gym.make('Carla-v0', n_heroes=N_HEROES, port=PORT)
    replay = MultiReplayBuffer(CAPACITY)

    from sac import SAC
    import torch
    import bz_utils as bzu

    bzu.log.init('log_v1')

    updates = 0
    trainer = SAC(OBSERVATION_SHAPE, N_ACTIONS, args)
    agent = trainer.policy
    # agent.load_state_dict(torch.load('log/latest.t7'))

    for _ in tqdm.tqdm(range(1000)):
        totals = [0 for _ in range(N_HEROES)]
        finished = list()
        states = env.reset(n_vehicles=N_VEHICLES, n_pedestrians=N_PEDESTRIANS)

        for i in tqdm.tqdm(range(1000), desc='Experiences'):
            _, _, actions = agent.sample(preprocess(states))
            actions = actions.detach().cpu().numpy()
            new_states, rewards, dones, infos = env.step(actions)

            for j in range(N_HEROES):
                totals[j] += rewards[j]

                if dones[j]:
                    finished.append(totals[j])
                    totals[j] = 0

            # env.render()
            replay.add(states, actions, rewards, new_states, dones)

            states = new_states

        for j in range(N_HEROES):
            totals[j] += rewards[j]
            finished.append(totals[j])

        bzu.log.scalar(is_train=True, **{'cumulative': np.mean(finished)})

        for i in tqdm.tqdm(range(1000), desc='Batch'):
            loss_q1, loss_q2, p_loss, a_loss, a_tlog = trainer.update_parameters(
                replay, args.batch_size, updates)
            scalars = {
                'loss_q1': loss_q1,
                'loss_q2': loss_q2,
                'p_loss': p_loss,
                'a_loss': a_loss,
                'a_tlog': a_tlog,
            }
            bzu.log.scalar(is_train=True, **scalars)
            updates += 1

        bzu.log.end_epoch(agent)
Example #6
def test_dpf_sac(d_path, s_path, threshold=0.02):
    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test
        dpf = DPF().to(device)
        dpf.load_model(d_path)
        sac = SAC(24)
        sac.load_model(s_path)

        reward = 0
        for episode in tqdm(range(100)):
            scene_data, obs = env.reset(False)

            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)  # [1, C, H, W]        rgb
            d = trans_d(obs['d']).to(device)  # [1, 1, H, W]        depth

            p, w, p_n, x, h = dpf(o, d, n_new=K)
            mean, var = get_variance(p, w)
            h_numpy = torch.cat((mean, var),
                                -1).view(-1).detach().cpu().numpy()

            steps = np.random.choice(7, 7, replace=False) + 1
            for step in steps:
                #th    = np.random.rand()*np.pi*2-np.pi
                #th    = np.pi/4*step
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                n_new = int(0.7 * K)  #int(K*(0.5**(_+1)))

                p, w, p_n, x, h = dpf(o, d, th, p, w, h, n_new, True)

                mean, var = get_variance(p, w)
                h_numpy = torch.cat((mean, var),
                                    -1).view(-1).detach().cpu().numpy()
                p_ = (F.softmax(w, 1).unsqueeze(2).unsqueeze(3) * p).sum(1)
                mse = F.mse_loss(p_, s).item()

                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break

        results.append(reward / 100)
    results = np.array(results)
    print("DPF SAC avg reward %10.4f | std %10.4f" %
          (np.mean(results), np.std(results)))
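get_variance() is used above but not shown; a possible implementation consistent with that usage (a sketch, assuming p holds particles of shape [B, K, ...] and w holds unnormalized log-weights of shape [B, K]).

import torch
import torch.nn.functional as F

def get_variance(p, w):
    w = F.softmax(w, dim=1)                  # normalize particle weights
    while w.dim() < p.dim():
        w = w.unsqueeze(-1)                  # broadcast weights over particle dimensions
    mean = (w * p).sum(dim=1)                # weighted particle mean
    var = (w * (p - mean.unsqueeze(1)) ** 2).sum(dim=1)  # weighted particle variance
    return mean, var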
Example #7
def get_policy(buffer, model, measure, mode, d_state, d_action,
               policy_replay_size, policy_batch_size, policy_active_updates,
               policy_n_hidden, policy_lr, policy_gamma, policy_tau,
               policy_explore_alpha, policy_exploit_alpha, buffer_reuse,
               device, verbosity, _log):

    if verbosity:
        _log.info("... getting fresh agent")

    policy_alpha = policy_explore_alpha if mode == 'explore' else policy_exploit_alpha

    agent = SAC(d_state=d_state,
                d_action=d_action,
                replay_size=policy_replay_size,
                batch_size=policy_batch_size,
                n_updates=policy_active_updates,
                n_hidden=policy_n_hidden,
                gamma=policy_gamma,
                alpha=policy_alpha,
                lr=policy_lr,
                tau=policy_tau)

    agent = agent.to(device)
    agent.setup_normalizer(model.normalizer)

    if not buffer_reuse:
        return agent

    if verbosity:
        _log.info("... transferring exploration buffer")

    size = len(buffer)
    for i in range(0, size, 1024):
        j = min(i + 1024, size)
        s, a = buffer.states[i:j], buffer.actions[i:j]
        ns = buffer.states[i:j] + buffer.state_deltas[i:j]
        s, a, ns = s.to(device), a.to(device), ns.to(device)
        with torch.no_grad():
            mu, var = model.forward_all(s, a)
        r = measure(s, a, ns, mu, var, model)
        agent.replay.add(s, a, r, ns)

    if verbosity:
        _log.info("... transferred exploration buffer")

    return agent
Example #8
def run(sdk_conn: cozmo.conn):
    """
    Container for the main loop; it is required to work with Cozmo. It is called by the cozmo.connect
    call in the main loop of this file.

    :param sdk_conn: SDK connection to Anki Cozmo
    :type sdk_conn: cozmo.conn
    :return: nothing
    :rtype: nothing
    """
    gettrace = getattr(sys, 'gettrace', None)
    if gettrace is not None and gettrace():
        debug = True
    else:
        debug = False
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    robot = sdk_conn.wait_for_robot()
    robot.enable_device_imu(True, True, True)
    # Turn on image receiving by the camera
    robot.camera.image_stream_enabled = True

    # Setting up Hyper-Parameters
    args, folder, logger, restore = initial_setup()
    # if not debug:
    # tb_tool = TensorBoardTool(folder)
    # tb_tool.run()
    logger.debug("Initial setup completed.")

    # Create JSON of Hyper-Parameters for reproducibility
    with open(folder + "hp.json", 'w') as outfile:
        json.dump(vars(args), outfile)

    # Initialize Environment
    gym_cozmo.initialize(robot, args.img_h, args.img_w)
    env = gym.make(args.env_name)

    # Setup the agent
    agent = SAC(args.state_buffer_size, env.action_space, env, args, folder,
                logger)
    i_run = args.run
    i_epi = args.episode
    agent.load_model_to_play(args.env_name, folder, i_run, i_epi)
    agent.play()
    env.close()
    logger.important("Program closed correctly!")
Example #9
def test_rnn_sac(r_path, s_path, threshold=0.02):
    rnn = RNNFilter().to(device)
    rnn.load_model(r_path)
    sac = SAC()
    sac.load_model(s_path)
    results = []
    for _ in tqdm(range(10)):
        env = gym.make('ActivePerception-v0')
        env.sid = 9900  # test

        reward = 0
        for episode in range(100):
            scene_data, obs = env.reset(False)

            s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
            o = trans_rgb(obs['o']).to(device)  # [1, C, H, W]        rgb
            d = trans_d(obs['d']).to(device)  # [1, 1, H, W]        depth

            s_, h = rnn(o, d)
            h_numpy = h.view(-1).detach().cpu().numpy()
            steps = np.random.choice(7, 7, replace=False) + 1
            #for step in range(7):        # n_actions allowed
            for step in steps:
                #th    = 2*np.pi*np.random.rand()-np.pi
                th = sac.policy_net.get_action(h_numpy.reshape(1, -1)).item()
                #th   = np.pi/4*step
                obs = env.step(th)
                o = trans_rgb(obs['o']).to(device)
                d = trans_d(obs['d']).to(device)
                th = torch.FloatTensor([th]).view(1, -1).to(device)
                s_, h = rnn(o, d, th, h)
                h_numpy = h.view(-1).detach().cpu().numpy()
                mse = F.mse_loss(s_, s).item()
                d = (mse < threshold)
                r = 8 if d else -1
                reward += r
                if d:
                    break
        results.append(reward / 100)
    results = np.array(results)
    print("RNN SAC avg reward %10.4f | std %10.4f" %
          (np.mean(results), np.std(results)))
Example #10
def test(arglist):
    env_name = arglist.env
    train_seed = arglist.train_seed
    test_seed = arglist.test_seed
    n_episodes = arglist.n_episodes
    render = arglist.render
    max_timesteps = 1001
    
    #env = gym.make(env_name)
    env = gen_envs(arglist)

    # Set random seed
    env.seed(test_seed)
    torch.manual_seed(test_seed)
    np.random.seed(test_seed)

    # load pretrained RL models
    agent = SAC(env.observation_space.shape[0], env.action_space, arglist)
    agent.load_model(env_name, train_seed)
    
    total_reward_list = []
    for ep in range(1, n_episodes+1):
        ep_reward = 0.0
        state = env.reset()
        for t in range(max_timesteps):
            noise = np.random.normal(0.0, 1.0, size=state.shape)
            noise = np.clip(noise, -1.0, 1.0)
            adv_state = state + arglist.noise_scale * noise
            action = agent.select_action(adv_state, eval=True)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
            if render:
                env.render()
            if done:
                break
            
        #print('Episode: {}\tReward: {}'.format(ep, int(ep_reward)))
        total_reward_list.append(ep_reward)
        ep_reward = 0.0
    env.close()
    return total_reward_list
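A hypothetical call to test() above; the Namespace fields are taken from the function body and the values are placeholders (SAC and gen_envs may require additional fields).

from argparse import Namespace
import numpy as np

arglist = Namespace(env='HalfCheetah-v2', train_seed=0, test_seed=100,
                    n_episodes=10, render=False, noise_scale=0.1)
rewards = test(arglist)
print('avg reward: %.2f +/- %.2f' % (np.mean(rewards), np.std(rewards)))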
Example #11
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]


    sac = SAC(args, action_dim, max_action, state_dim, device)
    summary = tensorboardX.SummaryWriter('./log/{}_sac_{}'.format(args['env_name'], args['noise_type']))

    timestep = 0
    start_time = time.time()
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()

        while True:
            action = sac.get_action(state)
            next_state, reward, done, info = env.step(action)
            sac.save(state, action, reward, next_state, int(done))
            episode_reward += reward
            state = next_state
            timestep += 1

            if sac.memory_counter > args['batch_size']: # start training once the buffer holds more than batch_size (64) samples
                sac.train()

            if done:
                print('episode: ', episode, '   reward : %.3f'%(episode_reward), '    timestep :', timestep)

                summary.add_scalar('reward/episode', episode_reward, episode)

                break

        if episode % args['save_freq'] == 0:
            if not os.path.exists('./SaveModel') :
                os.mkdir('./SaveModel')
            torch.save(sac.actor.state_dict(), './SaveModel/{}_sac_{}_{}'.format(args['env_name'], args['noise_type'], episode))
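A hypothetical args dict for main() above; the keys are taken from their use in the function, the values are placeholders, and the SAC constructor likely needs additional keys.

if __name__ == '__main__':
    main({'env_name': 'Pendulum-v0', 'noise_type': 'none',
          'max_episode': 1000, 'batch_size': 64, 'save_freq': 100})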
Example #12
def experiment(variant):
    print('CUDA status:', torch.cuda.is_available())
    env = make_env(variant['env'])

    # Set seeds
    variant['seed'] = int(variant['seed'])
    env.seed(int(variant['seed']))
    torch.manual_seed(int(variant['seed']))
    np.random.seed(int(variant['seed']))

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {"state_dim": state_dim, "action_dim": action_dim, "max_action": max_action,
              "discount": variant['discount'], "tau": variant['tau'],
              'network_class': NETWORK_CLASSES[variant['network_class']]}

    # custom network kwargs
    mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                              hidden_dim=variant['hidden_dim'],
                              first_dim=variant['first_dim'])
    dropout_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                      hidden_dim=variant['hidden_dim'],
                                      first_dim=variant['first_dim'],
                                      dropout_p=variant['dropout_p'])
    variable_init_mlp_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                            hidden_dim=variant['hidden_dim'],
                                            first_dim=variant['first_dim'],
                                            sigma=variant['sigma'])
    fourier_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                  hidden_dim=variant['hidden_dim'],
                                  fourier_dim=variant['fourier_dim'],
                                  sigma=variant['sigma'],
                                  concatenate_fourier=variant['concatenate_fourier'],
                                  train_B=variant['train_B'])
    siren_network_kwargs = dict(n_hidden=variant['n_hidden'],
                                hidden_dim=variant['hidden_dim'],
                                first_omega_0=variant['omega'],
                                hidden_omega_0=variant['omega'])
    if variant['network_class'] in {'MLP', 'D2RL', 'ConcatMLP', 'SpectralMLP'}:
        kwargs['network_kwargs'] = mlp_network_kwargs
    elif variant['network_class'] == 'DropoutMLP':
        kwargs['network_kwargs'] = dropout_mlp_network_kwargs
    elif variant['network_class'] == 'VariableInitMLP':
        kwargs['network_kwargs'] = variable_init_mlp_network_kwargs
    elif variant['network_class'] in {'FourierMLP', 'LogUniformFourierMLP'}:
        kwargs['network_kwargs'] = fourier_network_kwargs
    elif variant['network_class'] == 'Siren':
        kwargs['network_kwargs'] = siren_network_kwargs
    else:
        raise NotImplementedError

    # Initialize policy
    if variant['policy'] == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = variant['policy_noise * max_action']
        kwargs["noise_clip"] = variant['noise_clip * max_action']
        kwargs["policy_freq"] = variant['policy_freq']
        policy = TD3.TD3(**kwargs)
    elif variant['policy'] == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif variant['policy'] == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif variant['policy'] == "SAC":
        kwargs['lr'] = variant['lr']
        kwargs['alpha'] = variant['alpha']
        kwargs['automatic_entropy_tuning'] = variant['automatic_entropy_tuning']
        kwargs['weight_decay'] = variant['weight_decay']
        # left out dmc
        policy = SAC(**kwargs)
    elif 'PytorchSAC' in variant['policy']:
        kwargs['action_range'] = [float(env.action_space.low.min()), float(env.action_space.high.max())]
        kwargs['actor_lr'] = variant['lr']
        kwargs['critic_lr'] = variant['lr']
        kwargs['alpha_lr'] = variant['alpha_lr']
        kwargs['weight_decay'] = variant['weight_decay']
        kwargs['no_target'] = variant['no_target']
        kwargs['mlp_policy'] = variant['mlp_policy']
        kwargs['mlp_qf'] = variant['mlp_qf']
        del kwargs['max_action']
        if variant['policy'] == 'PytorchSAC':
            policy = PytorchSAC(**kwargs)
        elif variant['policy'] == 'RandomNoisePytorchSAC':
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = RandomNoiseSACAgent(**kwargs)
        elif variant['policy'] == 'SmoothedPytorchSAC':
            kwargs['n_critic_samples'] = variant['n_critic_samples']
            kwargs['noise_dist'] = variant['noise_dist']
            kwargs['noise_scale'] = variant['noise_scale']
            policy = SmoothedSACAgent(**kwargs)
        elif variant['policy'] == 'FuncRegPytorchSAC':
            kwargs['critic_target_update_frequency'] = variant['critic_freq']
            kwargs['fr_weight'] = variant['fr_weight']
            policy = FuncRegSACAgent(**kwargs)
    else:
        raise NotImplementedError

    if variant['load_model'] != "":
        raise RuntimeError

    # load replay buffer
    replay_buffer = torch.load(os.path.join(variant['replay_buffer_folder'], 'generated_replay_buffer.pt'))

    policy_optimizer = torch.optim.Adam(policy.actor.parameters(), lr=variant['lr'])
    qf_optimizer = torch.optim.Adam(policy.critic.Q1.parameters(), lr=variant['lr'])

    # split into train and val for both action and q_value
    indices = np.arange(replay_buffer.max_size)
    random.shuffle(indices)
    train_indices = indices[:int(0.9 * len(indices))]
    val_indices = indices[int(0.9 * len(indices)):]
    train_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[train_indices]).float(),
                                                   torch.tensor(replay_buffer.action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.correct_action[train_indices]).float(),
                                                   torch.tensor(replay_buffer.q_value[train_indices]).float())
    val_dataset = torch.utils.data.TensorDataset(torch.tensor(replay_buffer.state[val_indices]).float(),
                                                 torch.tensor(replay_buffer.action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.correct_action[val_indices]).float(),
                                                 torch.tensor(replay_buffer.q_value[val_indices]).float())

    # train a network on it
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=variant['batch_size'], shuffle=True,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=variant['batch_size'], shuffle=True,
                                             pin_memory=True)

    train_q_losses = []
    train_policy_losses = []
    val_q_losses = []
    val_policy_losses = []
    for _ in trange(variant['n_train_epochs']):
        total_q_loss = 0
        total_policy_loss = 0
        for (state, action, correct_action, q) in train_loader:
            state = state.to(DEVICE)
            action = action.to(DEVICE)
            correct_action = correct_action.to(DEVICE)
            q = q.to(DEVICE)
            q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
            policy_preds = policy.actor(state).mean
            q_loss = F.mse_loss(q_preds, q)
            policy_loss = F.mse_loss(policy_preds, correct_action)
            qf_optimizer.zero_grad()
            policy_optimizer.zero_grad()
            q_loss.backward()
            policy_loss.backward()
            qf_optimizer.step()
            policy_optimizer.step()
            total_q_loss += q_loss.item()
            total_policy_loss += policy_loss.item()

        # get validation stats
        total_val_q_loss = 0
        total_val_policy_loss = 0
        with torch.no_grad():
            for (state, action, correct_action, q) in val_loader:
                state = state.to(DEVICE)
                action = action.to(DEVICE)
                correct_action = correct_action.to(DEVICE)
                q = q.to(DEVICE)
                q_preds = policy.critic.Q1(torch.cat([state, action], dim=-1))
                policy_preds = policy.actor(state).mean
                q_loss = F.mse_loss(q_preds, q)
                policy_loss = F.mse_loss(policy_preds, correct_action)
                total_val_q_loss += q_loss.item()
                total_val_policy_loss += policy_loss.item()

        train_q_losses.append(total_q_loss / len(train_loader))
        train_policy_losses.append(total_policy_loss / len(train_loader))
        val_q_losses.append(total_val_q_loss / len(val_loader))
        val_policy_losses.append(total_val_policy_loss / len(val_loader))
        print(f'train: qf loss: {train_q_losses[-1]:.4f}, policy loss: {train_policy_losses[-1]:.4f}')
        print(f'val: qf loss: {val_q_losses[-1]:.4f}, policy loss: {val_policy_losses[-1]:.4f}')

    # evaluate the resulting policy for 100 episodes
    eval_return = eval_policy(policy, variant['env'], variant['seed'], eval_episodes=variant['eval_episodes'])

    # save the results
    to_save = dict(
        train_q_losses=train_q_losses,
        train_policy_losses=train_policy_losses,
        val_q_losses=val_q_losses,
        val_policy_losses=val_policy_losses,
        eval_return=eval_return,
        qf=policy.critic.Q1.state_dict(),
        policy=policy.actor.state_dict()
    )
    torch.save(to_save, os.path.join(variant['replay_buffer_folder'], f'{variant["network_class"]}_distillation.pt'))
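eval_policy() is referenced above but not shown; a sketch in the spirit of the TD3 reference code (an assumption: the project's own helper may differ, e.g. PytorchSAC-style agents expose act() rather than select_action()).

import numpy as np

def eval_policy(policy, env_name, seed, eval_episodes=10):
    eval_env = make_env(env_name)      # make_env comes from the surrounding module
    eval_env.seed(seed + 100)
    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = eval_env.reset(), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes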
Example #13
# Environment
from fetch_env import fetch_env
env = UnityEnvironment(fetch_env(args.env, args.system))
default_brain = env.brain_names[0]
brain = env.brains[default_brain]

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
num_worker = 11
state_dim = 1060
high = np.ones(39)
action_dim = spaces.Box(-high, high, dtype=np.float32)
agent = SAC(state_dim, action_dim, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Training Loop
total_numsteps = 0

agent_reward = np.zeros(num_worker)
buffer_reward = np.zeros(num_worker)
done = False
env_info = env.reset(train_mode=True)[default_brain]
states = env_info.vector_observations
Example #14
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha,
              prefill_steps, discount, batch_size, learning_rate, tau, two_qf):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
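A hypothetical invocation of train_SAC() above; all values are placeholders chosen to match the parameter names, not settings from the source.

train_SAC(env_name='HalfCheetah-v2', exp_name='sac_test', n_iter=100, ep_len=1000,
          seed=1, logdir='data/sac_test', alpha=0.2, prefill_steps=1000,
          discount=0.99, batch_size=256, learning_rate=3e-4, tau=0.01, two_qf=True)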
Example #15
File: test.py Project: karush17/esac
    epsilon_start = 1.0
    epsilon_final = 0.1
    epsilon_decay = 1200000

    epsilon_by_frame = lambda frame_idx: epsilon_final + (
        epsilon_start - epsilon_final) * math.exp(-1. * frame_idx /
                                                  epsilon_decay)

    # Worker Process Queues
    output_queue = mp.Queue(maxsize=args.pop)
    params_queue = mp.Queue(maxsize=args.pop)
    elite_queue = mp.Queue(maxsize=int(2 * args.pop))

    # Agent
    agent = SAC(STATE_DIM, ACTION_DIM, args)
    policy_checkpoint = torch.load(checkpoint_name + '/actor.pth.tar')
    agent.policy.load_state_dict(policy_checkpoint['model_state_dict'])
    sac_episodes = args.sac_episodes

    # Memory
    memory = ReplayMemory(args.replay_size)
    processes = []
    elite_list = []

    # Training Loop
    total_numsteps = 0
    updates = 0
    time_list = []
    max_rewards = []
    min_rewards = []
Example #16
def run_session(db_name, max_session_length, sweep, session, model_name,
                params):

    alg = SAC(params)
    car = Car()
    car.reset()

    training_after_episodes = params["training_after_episodes"]

    episode = 0

    random_episodes = params["random_episodes"]

    max_episode_length = params["max_episode_length"]

    THROTTLE_MAX = params["throttle_max"]
    THROTTLE_MIN = params["throttle_min"]
    STEER_LIMIT_LEFT = -1
    STEER_LIMIT_RIGHT = 1

    action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, -1]),
                              high=np.array([STEER_LIMIT_RIGHT, 1]),
                              dtype=np.float32)

    for i in range(max_session_length):
        episode += 1
        throttle = 0.15
        try:
            step = 0

            state = car.reset()
            time.sleep(1)
            state = car.step([0, 0.01])
            #print(state)

            state = alg.process_image(state)
            state = np.stack((state, state, state, state), axis=0)
            episode_buffer = EpisodeBuffer(alg.horizon, alg.discount)
            episode_reward = 0

            while step < max_episode_length:

                t = time.time_ns()

                step += 1
                temp = state[np.newaxis, :]

                if episode < random_episodes:
                    action = action_space.sample()
                else:
                    action = alg.select_action(temp)
                    #action[1] = max(THROTTLE_MIN, min(THROTTLE_MAX, action[1]))
                    action[0] = max(STEER_LIMIT_LEFT,
                                    min(STEER_LIMIT_RIGHT, action[0]))

                throttle += action[1] / 100.0
                throttle = max(THROTTLE_MIN, min(THROTTLE_MAX, throttle))
                action[1] = throttle
                action[1] = 0.3

                next_state = car.step(action)

                im = next_state

                darkness = len(im[(im > 120) * (im < 130)])

                if darkness < 2500:  # < len(im[(im > 160) * (im < 170)]):
                    raise KeyboardInterrupt

                next_state = alg.process_image(next_state)
                reward = (throttle - THROTTLE_MIN) / (THROTTLE_MAX -
                                                      THROTTLE_MIN)

                reward = darkness / 7000

                image_to_ascii(next_state[::2].T)

                episode_reward += reward
                print(
                    "Sweep: {}, Episode: {}, Step: {}, Episode reward: {:.2f}, Step reward: {:.2f}"
                    .format(sweep, episode, step, episode_reward, reward))

                not_done = 1.0

                next_state = next_state[np.newaxis, :]
                next_state = np.vstack((state[:3, :, :], next_state))

                out = episode_buffer.add(
                    [state, action, [reward], next_state, [not_done]])

                last = [state, action, [reward], next_state, [not_done]]
                alg.push_buffer(last)

                #if out:
                #alg.push_buffer(out)

                state = next_state

                if len(alg.replay_buffer) > alg.batch_size:
                    alg.update_parameters()

                tn = time.time_ns()

                #sync with the network
                time.sleep(max(0, 0.1 - (tn - t) / 1e9))

            raise KeyboardInterrupt

        except KeyboardInterrupt:

            last[4] = [0]
            alg.push_buffer(last)

            car.reset()

            #if episode % 5 == 0:
            #print("Saving chekcpoint")
            #torch.save(alg, "sac_model_checkpoint.pth")
            print("Calculating reward")

            # episode_buffer = episode_buffer.as_list()

            # for i in range(len(episode_buffer)):
            #     reward = 0

            #     for j in range(min(len(episode_buffer) - i, alg.horizon)):
            #         reward += alg.discount**j * episode_buffer[i + j][2][0]

            #     norm = (1 - alg.discount**alg.horizon) / (1 - alg.discount)
            #     e = episode_buffer[i]
            #     e[2] = [reward / norm]
            #     if i == len(episode_buffer) - 1:
            #         e[-1][0] = 0.0

            #     alg.push_buffer(e)

            if len(alg.replay_buffer) > alg.batch_size:
                print("Training")
                for i in range(training_after_episodes):
                    alg.update_parameters()

            db.insert_episode(db_name, session, episode, step, episode_reward)

            time.sleep(5)
Example #17
def readCSMNC(fileName):
    # open files
    fn = open(fileName, 'r')
    # magic number, ns, number of split strings
    # counted from the *.dat file downloaded from CSMNC
    ns = 59
    tmp1 = fn.read()
    tmp2 = tmp1.split(maxsplit=59)
    tmpData = tmp2[59]
    # output the longitude and latitude of station to stationList.dat
    tmpLat = tmp2[18]
    tmpLong = tmp2[19]
    lat = tmpLat[0:6]
    lon = tmpLong[0:7]
    # magic number, td, duration time in seconds
    # pay attention to the type of variables!!!
    # td and step are floats
    # nps is an integer
    td = float(tmp2[52])
    # time intervals, = 0.005 SEC
    step = float(tmp2[43])
    # nps, number of data points
    nps = int(float(tmp2[38]))

    # start to assign SAC header variables
    # component direction
    tmp2[30]
    # time interval
    delta = float(tmp2[43])
    # number of points
    npts = nps
    # station name
    stnm = tmp2[17]
    # instrument
    inst = tmp2[25]
    # component name
    cmpNm = tmp2[30]
    # cmpaz, azimuth; cmpinc, inclination
    if (cmpNm == 'UD'):
        cmpaz = 0.0
        cmpinc = 0.0
    elif (cmpNm == 'EW'):
        cmpaz = 90.0
        cmpinc = 90.0
    elif (cmpNm == 'NS'):
        cmpaz = 0.0
        cmpinc = 90.0
    else:
        raise ValueError('wrong component direction', cmpNm)
    # event name
    evnm = tmp2[5]
    # origin time
    ot = str(2) + tmp2[1]
    nzyear = int(ot[0:4])
    mon = int(ot[4:6])
    day = int(ot[6:8])
    nzhour = int(ot[8:10])
    nzmin = int(ot[10:12])
    nzsec = int(ot[12:])
    nzjday = ymd2jday(nzyear, mon, day)
    nzmsec = 0
    # event latitude
    evla = float(tmp2[9][0:6])
    # event longitude
    evlo = float(tmp2[10][0:7])
    # event depth
    evdp = float(tmp2[12])
    # magnitude
    mag = float(tmp2[15][0:2])
    # latitude and longitude
    stla = lat
    stlo = lon
    iftype = 1
    # linspace default include start and stop
    # np.linspace(start, stop, number of samples)
    times = np.linspace(step, td, npts)
    tmp3 = tmpData.split(maxsplit=npts)
    tmp4 = np.asfarray(tmp3)
    # create null SAC object
    head = SAC()
    # set head variables
    head.set_ot(nzyear, nzjday, nzhour, nzmin, nzsec, nzmsec)
    head.set_magtyp(53)
    head.set_dep(8)
    head.set_ovrok(1)
    head.set_cmp(cmpaz, cmpinc)
    head.set_evdp(evdp)
    head.set_evla(evla)
    head.set_evlo(evlo)
    head.set_kevnm(evnm)
    head.set_mag(mag)
    head.set_kinst(inst)
    head.set_kstnm(stnm)
    head.set_stloc(stla, stlo)
    # set necessary header variables' values
    head.set_iftype(iftype)
    head.set_npts(npts)
    head.set_delta(delta)
    head.set_leven()
    head.set_be(step, td)
    head.set_nvhdr()
    # close all files
    fn.close()
    return head, tmp4
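ymd2jday() is used above but not shown; a possible helper matching that usage (an assumption): convert a calendar date to the day-of-year value expected by the SAC header.

import datetime

def ymd2jday(year, mon, day):
    return datetime.date(year, mon, day).timetuple().tm_yday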
Example #18
def main():
    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--env-name', default="HalfCheetah-v2",
                        help='name of the environment to run')
    parser.add_argument('--policy', default="Gaussian",
                        help='algorithm to use: Gaussian | Deterministic')
    parser.add_argument('--eval', type=bool, default=True,
                        help='Evaluates a policy every 10 episodes (default: True)')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor for reward (default: 0.99)')
    parser.add_argument('--tau', type=float, default=0.005, metavar='G',
                        help='target smoothing coefficient(τ) (default: 0.005)')
    parser.add_argument('--lr', type=float, default=0.0003, metavar='G',
                        help='learning rate (default: 0.0003)')
    parser.add_argument('--alpha', type=float, default=0.2, metavar='G',
                        help='Temperature parameter α determines the relative importance of the entropy term against the reward (default: 0.2)')
    parser.add_argument('--automatic_entropy_tuning', type=bool, default=False,
                        metavar='G',
                        help='Temperature parameter α automatically adjusted (default: False).')
    parser.add_argument('--seed', type=int, default=456, metavar='N',
                        help='random seed (default: 456)')
    parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                        help='batch size (default: 256)')
    parser.add_argument('--num_steps', type=int, default=2000001, metavar='N',
                        help='maximum number of steps (default: 2000000)')
    parser.add_argument('--hidden_size', type=int, default=256, metavar='N',
                        help='hidden size (default: 256)')
    parser.add_argument('--updates_per_step', type=int, default=1, metavar='N',
                        help='model updates per simulator step (default: 1)')
    parser.add_argument('--start_steps', type=int, default=10000, metavar='N',
                        help='Steps sampling random actions (default: 10000)')
    parser.add_argument('--target_update_interval', type=int, default=1,
                        metavar='N',
                        help='Value target update per no. of updates per step (default: 1)')
    parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                        help='size of replay buffer (default: 1000000)')
    parser.add_argument('--cuda', action="store_true",
                        help='run on CUDA (default: False)')
    parser.add_argument('--resume-name', default=None,
                        help='Name of saved model to load')
    args, unknown = parser.parse_known_args()

    # Import custom envs
    import gym_match_input_continuous
    import deepdrive_2d

    # TensorboardX
    run_name = '{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
        args.env_name,
        args.policy,
        "autotune" if args.automatic_entropy_tuning else "")

    # Log to file
    os.makedirs('logs', exist_ok=True)
    log.add(f'logs/{run_name}.log')

    log.info(' '.join(sys.argv))

    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    env = gym.make(args.env_name)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)
    env.reset()

    # Agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    if args.resume_name:
        agent.load_model(f'{DIR}/models/sac_actor_runs/{args.resume_name}',
                         f'{DIR}/models/sac_critic_runs/{args.resume_name}')



    run_name = 'runs/' + run_name

    writer = SummaryWriter(logdir=run_name)

    # Memory
    memory = ReplayMemory(args.replay_size)

    train(agent, args, env, memory, run_name, writer)
    env.close()
Example #19
                        dest="continue_training",
                        help="Continue training from a checkpoint")
    parser.add_argument("-r",
                        "--render_testing",
                        action="store_true",
                        default=True,
                        dest="render_testing",
                        help="Render window when testing agent.")
    parser.add_argument("-n",
                        "--num_test_games",
                        action="store",
                        default=1,
                        type=int,
                        dest="num_test_games",
                        help="How many games to play when testing.")

    parser.add_argument("--version",
                        action="version",
                        version="PyTorch-SAC Version 0.1")

    args = parser.parse_args()

    sac = SAC(env_name=args.env_name,
              data_save_dir=os.path.join("runs", args.log_dir))
    if not args.test:
        sac.train(resume_training=args.continue_training)
    else:
        sac.test(render=args.render_testing,
                 use_internal_policy=False,
                 num_games=args.num_test_games)
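The snippet above begins partway through an add_argument() call; a minimal reconstruction of the omitted parser setup, with flag names inferred from the attributes used later and defaults assumed.

import argparse

parser = argparse.ArgumentParser(description="PyTorch-SAC")
parser.add_argument("--env_name", default="Pendulum-v0", dest="env_name",
                    help="Gym environment to run.")
parser.add_argument("--log_dir", default="sac", dest="log_dir",
                    help="Subdirectory of runs/ for logs and checkpoints.")
parser.add_argument("-t", "--test", action="store_true", default=False, dest="test",
                    help="Test a trained agent instead of training.")
parser.add_argument("-c", "--continue_training", action="store_true", default=False,
                    dest="continue_training",
                    help="Continue training from a checkpoint")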
Example #20
env.get_observation_space_size = new_get_observation_space_size

env.observation_space = ([0] * env.get_observation_space_size(),
                         [0] * env.get_observation_space_size())
env.observation_space = convert_to_gym(env.observation_space)

# Create log dir for callback model saving
os.makedirs("./temp_models/", exist_ok=True)
env = Monitor(env, "./temp_models/", allow_early_resets=True)

##### TRAIN #####

if args.train:
    check_overwrite(args.model)
    model = SAC(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log="./tensorboard_log/")
    model.learn(total_timesteps=int(args.step),
                log_interval=10,
                tb_log_name="log",
                callback=callback.callback)
    model.save(MODELS_FOLDER_PATH)

##### TEST #####

if not args.train:
    model = SAC.load(MODELS_FOLDER_PATH)
    obs = env.reset()
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(scale_range(action, -1, 1, 0, 1))
Example #21
import gym
from sac import SAC

from utils.sac_runner import vector_train
from utils.sac_runner import evaluate

if __name__ == "__main__":
    env = gym.vector.make("Pendulum-v0", num_envs=4, asynchronous=True)
    actor = SAC(env.single_observation_space,
                env.single_action_space,
                p_lr=1e-3,
                q_lr=1e-3)

    returns = vector_train(actor, env, 40000, -200)

    eval_env = gym.make("Pendulum-v0")
    evaluate(actor, eval_env, 1, True)
Example #22
memory_discount = 0.95
memory_horizon = 1

sac_params = {
    "linear_output": sac_input,
    "lr": 0.0003,
    "target_entropy": -2,
    "batch_size": 64,
    "hidden_size": 128
}

# Create the controller for the Donkey env
env = Car("kari_main", "mqtt.eclipse.org")
env.reset()
# Create the SAC agent to control the env
agent = SAC(parameters=sac_params)
# Create the state representation functionality

if input("Load model?"):
    agent = torch.load("model.pth")

throttle_weight_1 = 0.1
throttle_weight_2 = -5

STEER_LIMIT_LEFT = -1
STEER_LIMIT_RIGHT = 1
THROTTLE_MAX = 0.23
THROTTLE_MIN = 0.15
MAX_STEERING_DIFF = 0.5

action_space = spaces.Box(low=np.array([STEER_LIMIT_LEFT, THROTTLE_MIN]),
Example #23
from gym import spaces

from functions import process_image, image_to_ascii, rgb2gray

from episode_buffer import EpisodeBuffer

params = {
    "target_entropy": -4,
    "hidden_size": 64,
    "batch_size": 64,
    "discount": 0.95,
    "lr": 0.0001
}

alg = SAC(parameters=params)
car = Car(car="kari_main")
car.reset()

## SAC hyperparameters

## Other hyperparameters

training_after_episodes = 1

episode = 0
random_episodes = 5

cmd = input(
    "If you want to load a model, give model path, default last checkpoint.")
if cmd != "":
Example #24
def train_rnn_sac(path, threshold=0.02):
    env = gym.make('ActivePerception-v0')
    rnn = RNNFilter().to(device)
    rnn.load_model(path)
    sac = SAC()

    # set up the experiment folder
    experiment_id = "rsac_" + get_datetime()
    save_path = CKPT + experiment_id + ".pt"

    max_frames = 100000
    frame_idx = 0
    best_loss = np.inf
    pbar = tqdm(total=max_frames)
    stats = {'losses': []}
    best_reward = 0
    avg_reward = 0
    avg_mse = 0
    episode = 0
    while frame_idx < max_frames:
        pbar.update(1)

        episode += 1
        env.sid = env.sid % 9900
        scene_data, obs = env.reset(False)

        S, A, R, D = [], [], [], []
        s = get_state(scene_data).to(device)  # [1, n_obj, dim_obj] state
        o = trans_rgb(obs['o']).to(device)  # [1, C, H, W]        rgb
        d = trans_d(obs['d']).to(device)  # [1, 1, H, W]        depth

        s_, h = rnn(o, d)
        prev_mse = F.mse_loss(s_, s).item()
        h_numpy = h.view(-1).detach().cpu().numpy()
        S.append(h_numpy)
        for _ in range(7):
            frame_idx += 1
            th = sac.policy_net.get_action(h_numpy.reshape(1, -1))
            obs = env.step(th.item())
            o = trans_rgb(obs['o']).to(device)
            d = trans_d(obs['d']).to(device)
            th = torch.FloatTensor([th]).view(1, -1).to(device)
            s_, h = rnn(o, d, th, h)

            mse = F.mse_loss(s_, s).item()
            #r        = (mse - prev_mse)*100
            prev_mse = mse
            d = (mse < threshold)
            r = 8 if d else -1
            h_numpy = h.view(-1).detach().cpu().numpy()

            S.append(h_numpy)
            A.append(th.cpu().numpy().reshape(-1))
            R.append(r)
            D.append(d)
            if d:
                break

        S, NS = S[:-1], S[1:]
        for s, a, r, ns, d in zip(S, A, R, NS, D):
            sac.replay_buffer.push(s, a, r, ns, d)
            if len(sac.replay_buffer) > batch_size:
                sac.soft_q_update(batch_size)

        avg_reward += np.array(R).sum()
        avg_mse += prev_mse
        if episode % 10 == 0:
            avg_reward /= 10
            avg_mse /= 10
            tqdm.write("[INFO] epi %05d | avg r: %10.4f | avg mse: %10.4f" %
                       (episode, avg_reward, avg_mse))
            if avg_reward > best_reward:
                best_reward = avg_reward
                sac.save_model(save_path)
            avg_reward = 0
            avg_mse = 0
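get_datetime() is used above but not shown; a possible helper matching that usage (an assumption): a timestamp string for the experiment id.

import datetime

def get_datetime():
    return datetime.datetime.now().strftime("%Y%m%d_%H%M%S")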
Example #25
File: main.py Project: Kellsky/d2rl
        env = gym.make(args.env_name, reward_type='dense')
    else:
        env = gym.make(args.env_name)
    test_env = gym.make(args.env_name)

args.cuda = True if torch.cuda.is_available() else False

# Agent
if args.gcp:
    obs_space = env.observation_space['desired_goal'].shape[0] + \
            env.observation_space['observation'].shape[0]
else:
    obs_space = env.observation_space.shape[0]

args.automatic_entropy_tuning = True
agent = SAC(obs_space, env.action_space, args)

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
    if args.gcp:
        goal = state['desired_goal']
Example #26
def main():
    """
    The main file of the project
    """

    # args and warnings ignoring setup
    simplefilter(action="ignore")
    parser = build_argparser()
    args = parser.parse_args()

    # environment setup
    env = NormalizedActions(gym.make(
        ENV_NAME))  # to ensure actions in [-1, 1] get correctly translated
    # setting libraries seeds to try and have repeatability
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # agent setup
    agent = SAC(env.observation_space, env.action_space, args)
    agent.load_networks_parameters(args.load_params)

    # if verbose, print a tabular recap of the args passed via command-line (or default ones)
    if args.verbose >= 1:
        t = Texttable()
        t.set_cols_dtype(['t', 'e'])
        t.add_rows([["Argument", "Value"]] + [[arg, getattr(args, arg)]
                                              for arg in vars(args)] +
                   [["device", agent.device]])
        print(t.draw())
        print("\nSetup completed. Settings shown in the table above.")

    # training
    if args.train:
        input("\nPress any key to begin training.")
        try:
            train(env, agent, args)
        except KeyboardInterrupt:
            # to stop training
            print("\nInterrupt received.")
        except Exception:
            # if anything else happens, catch the exception and print it but without crashing
            traceback.print_exc()
        finally:
            print("\nTraining terminated.")
            # if required to save parameters, or need them for later testing, save them
            if args.save_params or args.test:
                global PARAMS_DIR
                PARAMS_DIR = agent.save_networks_parameters(
                    args.save_params_dir)

            # save the plot that has been generated so far, if any
            if args.plot:
                save_plot()

            # close the environment
            env.close()

    # testing
    if args.test:
        try:
            # build environment and agent
            env = NormalizedActions(gym.make(ENV_NAME))
            agent = SAC(env.observation_space, env.action_space, args)

            if PARAMS_DIR is None:
                # then look if the user has specified a directory for loading parameters
                if args.load_params is None:
                    # then the agent will not load any parameters and will therefore act purely random
                    print("WARNING: Testing a random agent.")
                else:
                    PARAMS_DIR = args.load_params
                    print("Using selected parameters.")
            else:
                print("Using training parameters.")

            # initialize agent's networks' parameters
            agent.load_networks_parameters(PARAMS_DIR)

            input("\nPress any key to begin testing.")
            test(env, agent, args)
        except KeyboardInterrupt:
            # to stop testing
            print("\nInterrupt received.")
        except Exception:
            # if anything else happens, catch the exception and print it but without crashing
            traceback.print_exc()
        finally:
            print("\nTesting terminated.")
            # save the plot that has been generated so far, if any
            if args.plot:
                save_plot()

            # close the environment
            env.close()
Example #27
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')

args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
env.seed(args.seed)
env.action_space.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# Tensorboard
path = 'runs/{}_SAC_{}_{}_{}_seed_{}'.format(datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
                                             args.policy, "autotune" if args.automatic_entropy_tuning else "", args.seed)
writer = SummaryWriter(path)


# Memory
memory = ReplayMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(0):
Example #28
def runner(env,
           actor_path,
           critic_path,
           timesteps_per_batch,
           number_trajs,
           stochastic_policy,
           save=False,
           reuse=False,
           args=None,
           render=True):
    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    # initial network
    pi = SAC(ob_space.shape[0], ac_space, args)
    # pi = policy_func("pi", ob_space, ac_space, reuse=reuse)
    # U.initialize()

    # Prepare for rollouts load model
    # ----------------------------------------
    pi.load_model(actor_path=actor_path, critic_path=critic_path)
    # u.load_variables(load_model_path)

    obs_list = []
    obs1_list = []
    acs_list = []
    reward_list = []
    done_list = []
    episode_len_list = []
    episode_return_list = []
    current_traj_num = 0
    current_abandon_traj_num = 0
    while current_traj_num <= number_trajs:
        traj = traj_1_generator(pi,
                                env,
                                timesteps_per_batch,
                                render,
                                stochastic=stochastic_policy)
        if traj['ep_len'] < timesteps_per_batch:
            current_abandon_traj_num += 1
            print("abandon episode number:{}!, episode len:{}".format(
                current_abandon_traj_num, traj['ep_len']))
            continue
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj[
            'ep_len'], traj['ep_ret']
        obs1, reward, done = traj['obs1'], traj['rew'], traj['new']

        # append multi dimension array
        obs_list.append(obs)
        obs1_list.append(obs1)
        acs_list.append(acs)
        episode_len_list.append(ep_len)
        episode_return_list.append(ep_ret)
        done_list.append(done)
        reward_list.append(reward)
        current_traj_num += 1
        if current_traj_num % 1 == 0:  # print every accepted episode; raise the modulus to print less often
            print("accept episode number:{}, len:{}, returns:{}".format(
                current_traj_num, ep_len, ep_ret))
        if current_traj_num >= number_trajs:
            break

    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        # Assemble the file name
        file_path = 'gather_expert_demonstration/expert_demonstration_data/model_guide/'
        # parenthesize the conditional so the suffix is appended in both cases
        file_name = ('stochastic' if stochastic_policy else 'deterministic') \
            + '_SAC_' + env.spec.id + "(6000)" + '_johnny'
        path = osp.join(file_path, file_name)
        # Save the gathered data collections to the filesystem
        np.savez(path,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(episode_len_list),
                 returns=np.array(episode_return_list),
                 done=np.array(done_list),
                 reward=np.array(reward_list))
        print("saving demonstrations")
        print("  @: {}.npz".format(path))
        # save expert data for the SAM comparison algorithm
        # Assemble the file name
        file_path = 'gather_expert_demonstration/expert_demonstration_data/sam/'
        # parenthesize the conditional so the suffix is appended in both cases
        file_name = ('stochastic' if stochastic_policy else 'deterministic') \
            + '_SAC_' + env.spec.id + "(6000)" + '_sam'
        path = osp.join(file_path, file_name)
        np.savez(path,
                 obs0=np.array(obs_list),
                 acs=np.array(acs_list),
                 env_rews=np.array(reward_list),
                 dones1=np.array(done_list),
                 obs1=np.array(obs1_list),
                 ep_lens=np.array(episode_len_list),
                 ep_env_rets=np.array(episode_return_list))
        print("saving demonstrations")
        print("  @: {}.npz".format(path))

    avg_len = sum(episode_len_list) / len(episode_len_list)
    avg_ret = sum(episode_return_list) / len(episode_return_list)
    print("Average length:", avg_len)
    print("Average return:", avg_ret)
    return avg_len, avg_ret
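# traj_1_generator is called above but is not defined in this excerpt. Below
# is a minimal sketch under the assumption that the policy object exposes a
# select_action(state, evaluate=...) method (a hypothetical name, not taken
# from the original code) and that the returned dict uses exactly the keys
# consumed by runner().
import numpy as np


def traj_1_generator(pi, env, horizon, render=False, stochastic=True):
    obs, acs, obs1, rews, news = [], [], [], [], []
    ob = env.reset()
    ep_ret, ep_len, done = 0.0, 0, False
    while not done and ep_len < horizon:
        ac = pi.select_action(ob, evaluate=not stochastic)
        new_ob, rew, done, _ = env.step(ac)
        if render:
            env.render()
        obs.append(ob)
        acs.append(ac)
        obs1.append(new_ob)
        rews.append(rew)
        news.append(done)
        ep_ret += rew
        ep_len += 1
        ob = new_ob
    return {'ob': np.array(obs), 'ac': np.array(acs), 'obs1': np.array(obs1),
            'rew': np.array(rews), 'new': np.array(news),
            'ep_len': ep_len, 'ep_ret': ep_ret}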
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    with tf.Session(config=tf_config) as sess:

        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler,
                                     session=sess,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
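# A hedged sketch of how train_SAC might be driven from the command line; the
# flag names and the logdir layout below are assumptions, not taken from the
# original script.
if __name__ == '__main__':
    import argparse
    import os
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', default='HalfCheetah-v2')
    parser.add_argument('--exp_name', default='sac')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--reparametrize', action='store_true')
    parser.add_argument('--two_qf', action='store_true')
    parser.add_argument('--old_funct', action='store_true')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--gpu', default='0')
    cli = parser.parse_args()

    logdir = os.path.join('data', '%s_%s_%d' % (cli.exp_name, cli.env_name,
                                                int(time.time())))
    train_SAC(cli.env_name, cli.exp_name, cli.seed, cli.reparametrize,
              cli.two_qf, cli.old_funct, logdir, cli.debug, cli.gpu)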
Example #30
0
args = parser.parse_args()

wandb.init(name=f"{args.env_name}-HERLoaded", project="MyExp")
# Environment
env = gym.make(args.env_name)

env.seed(args.seed)

torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Agent
if args.env_name.startswith('Fetch'):
    env_space = env.observation_space.spaces
    agent = SAC(
        env_space['observation'].shape[0] + env_space['desired_goal'].shape[0],
        env.action_space, args)

else:
    agent = SAC(env.observation_space.shape[0] + 2, env.action_space, args)
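# The agent's input size above is the observation plus the goal, so the
# training loop presumably concatenates the two before calling the agent.
# A minimal sketch of that preprocessing for the Fetch case; the helper name
# is hypothetical and not part of the original script.
def flatten_fetch_obs(obs_dict):
    return np.concatenate([obs_dict['observation'], obs_dict['desired_goal']])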

# Memory
memory = ReplayGMemory(args.replay_size, args.seed)

# Training Loop
total_numsteps = 0
updates = 0
did_it = False
for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0