Example #1
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=-1,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 0, 0, 0]),
                                initialAngles=np.array([0, 45, 0, 0, 0, 0, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionRange = env.action_range()
    actionSpace = DiscreteSpace(
        intervals=[15 for i in range(2)] + [1],
        ranges=[actionRange[1], actionRange[2], actionRange[7]])
    processor = JointProcessor(actionSpace)

    # create the model and policy functions
    modelFn = MxFullyConnected(sizes=[stateSpace.n + actionSpace.n, 64, 32, 1],
                               alpha=0.001,
                               use_gpu=True)
    if args.load_params:
        print("loading params...")
        modelFn.load_params(args.load_params)

    # softmax over the sampled actions' Q-values; shift by the max for numerical stability
    softmax = lambda s: np.exp(s - np.max(s)) / np.sum(np.exp(s - np.max(s)))
    policyFn = EpsilonGreedyPolicy(
        epsilon=0.5,
        getActionsFn=lambda state: actionSpace.sample(1024),
        distributionFn=lambda qstate: softmax(modelFn(qstate)))
    dataset = ReplayBuffer()
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done:
            if stopsig:
                break
            action = policyFn(state)
            nextState, reward, done, info = env.step(
                createAction(processor.process_env_action(action)))
            dataset.append(state, action, reward, nextState)
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        dataset.reset()  # push trajectory into the dataset buffer
        modelFn.fit(processor.process_Q(dataset.sample(1024)), num_epochs=10)
        print("Reward:", reward if (reward >= 0.00001) else 0, "with Error:",
              modelFn.score(), "with steps:", steps)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(reward) + ", " +
                      str(modelFn.score()) + "]\n")

        rollout += 1
        if rollout % 100 == 0:
            policyFn.epsilon *= 0.95
            print("Epsilon is now:", policyFn.epsilon)

    if args.logfile:
        log.close()
    if args.save_params:
        print("saving params...")
        modelFn.save_params(args.save_params)
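
Both this example and the next poll a module-level stopsig flag and install stopsigCallback as a SIGINT handler, neither of which is shown here; they also rely on imports and project classes (BasketballVelocityEnv, the space/processor/model/policy classes, ReplayBuffer, createAction) defined elsewhere in the repository. A minimal sketch of the assumed scaffolding, hypothetical apart from the names the example actually references:

import argparse
import signal

import numpy as np

# Flag polled inside the rollout loops; set by the SIGINT handler so a run can stop cleanly.
stopsig = False

def stopsigCallback(signo, frame):
    global stopsig
    stopsig = True

if __name__ == "__main__":
    main()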
Example #2
def main():
    # define arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--render",
                        action="store_true",
                        help="Render the state")
    parser.add_argument("--render_interval",
                        type=int,
                        default=10,
                        help="Number of rollouts to skip before rendering")
    parser.add_argument("--num_rollouts",
                        type=int,
                        default=1000,
                        help="Number of max rollouts")
    parser.add_argument("--logfile",
                        type=str,
                        help="Indicate where to save rollout data")
    parser.add_argument(
        "--load_params",
        type=str,
        help="Load previously learned parameters from [LOAD_PARAMS]")
    parser.add_argument("--save_params",
                        type=str,
                        help="Save learned parameters to [SAVE_PARAMS]")
    parser.add_argument("--gamma",
                        type=float,
                        default=0.99,
                        help="Discount factor")
    parser.add_argument("--test", action="store_true", help="Test the params")
    args = parser.parse_args()

    signal.signal(signal.SIGINT, stopsigCallback)
    global stopsig

    # create the basketball environment
    env = BasketballVelocityEnv(fps=60.0,
                                timeInterval=0.1,
                                goal=[0, 5, 0],
                                initialLengths=np.array([0, 0, 1, 1, 1, 0, 1]),
                                initialAngles=np.array(
                                    [0, 45, -20, -20, 0, -20, 0]))

    # create space
    stateSpace = ContinuousSpace(ranges=env.state_range())
    actionSpace = ContinuousSpace(ranges=env.action_range())

    # create the model and policy functions
    modelFn = PoWERDistribution(stateSpace.n,
                                actionSpace.n,
                                sigma=5.0 if not args.test else 0)
    if args.load_params:
        print("Loading params...")
        modelFn.load_params(args.load_params)

    replayBuffer = ReplayBuffer(1024)
    if args.logfile:
        log = open(args.logfile, "a")

    rollout = 0
    while args.num_rollouts == -1 or rollout < args.num_rollouts:
        print("Iteration:", rollout)
        state = env.reset()
        reward = 0
        done = False
        steps = 0
        while not done and steps < 5:
            if stopsig:
                break
            action, eps = modelFn.predict(
                state, replayBuffer.sample(gamma=args.gamma))
            if steps == 4:
                action[-1] = 1.0
            nextState, reward, done, info = env.step(action)
            replayBuffer.append(state,
                                action,
                                reward,
                                nextState=nextState,
                                info={"eps": eps})
            state = nextState
            steps += 1
            if args.render and rollout % args.render_interval == 0:
                env.render()
        if stopsig:
            break

        # no importance sampling for now; implement it when working with small datasets
        replayBuffer.reset()
        dataset = replayBuffer.sample(gamma=args.gamma)
        modelFn.fit(dataset)

        avgR = np.sum(dataset["rewards"]) / float(len(dataset["rewards"]))
        avgQ = np.sum(dataset["values"]) / float(len(dataset["values"]))
        print("Rollouts:", rollout, "Error:", modelFn.score(), "Average Q",
              avgQ, "Average R", avgR)
        if args.logfile:
            log.write("[" + str(rollout) + ", " + str(modelFn.score()) + ", " +
                      str(avgQ) + ", " + str(avgR) + "]\n")
        rollout += 1

    if args.logfile:
        log.close()
    if args.save_params:
        print("Saving params...")
        modelFn.save_params(args.save_params)
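
Assuming the example is saved as a standalone script (the name train_power.py below is purely illustrative), a typical invocation using the flags defined above might look like:

python train_power.py --num_rollouts 500 --gamma 0.99 --save_params power.params --logfile rollouts.log --render --render_interval 25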
Example #3
def train(config_filepath, save_dir, device, visualize_interval):
    conf = load_toml_config(config_filepath)
    data_dir, log_dir = create_save_dir(save_dir)
    # Save config file
    shutil.copyfile(config_filepath,
                    os.path.join(save_dir, os.path.basename(config_filepath)))
    device = torch.device(device)

    # Set up log metrics
    metrics = {
        'episode': [],
        'episodic_step': [],
        'collected_total_samples': [],
        'reward': [],
        'q_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'policy_switch_epoch': [],
        'policy_switch_sample': [],
        'test_episode': [],
        'test_reward': [],
    }

    policy_switch_samples = conf.policy_switch_samples if hasattr(
        conf, "policy_switch_samples") else None
    total_collected_samples = 0

    # Create environment
    env = make_env(conf.environment, render=False)

    # Instantiate modules
    memory = ReplayBuffer(int(conf.replay_buffer_capacity),
                          env.observation_space.shape, env.action_space.shape)
    agent = getattr(agents, conf.agent_type)(env.observation_space,
                                             env.action_space,
                                             device=device,
                                             **conf.agent)

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        policy_switch_samples = ckpt['policy_switch_samples']
        total_collected_samples = ckpt['total_collected_samples']

    def save_checkpoint():
        # Save checkpoint
        ckpt = {
            'metrics': metrics,
            'agent': agent.state_dict(),
            'memory': memory.state_dict(),
            'policy_switch_samples': policy_switch_samples,
            'total_collected_samples': total_collected_samples
        }
        path = os.path.join(data_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(data_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(data_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(
        metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    agent_update_count = 0
    for episode in pbar:
        episodic_reward = 0
        o = env.reset()
        q1_loss, q2_loss, policy_loss, alpha_loss, alpha = None, None, None, None, None

        for t in range(conf.horizon):
            if total_collected_samples <= conf.random_sample_num:  # Select random actions at the beginning of training.
                h = env.action_space.sample()
            elif memory.step <= conf.random_sample_num:  # Select actions from a random latent variable right after inserting a new subpolicy.
                h = agent.select_action(o, random=True)
            else:
                h = agent.select_action(o)

            a = agent.post_process_action(
                o, h)  # Convert abstract action h to actual action a

            o_next, r, done, _ = env.step(a)
            total_collected_samples += 1
            episodic_reward += r
            memory.push(o, h, r, o_next, done)
            o = o_next

            if memory.step > conf.random_sample_num:
                # Update agent
                batch_data = memory.sample(conf.agent_update_batch_size)
                q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(
                    batch_data, agent_update_count)
                agent_update_count += 1

            if done:
                break

        # Describe and save episodic metrics
        reward_moving_avg = (episodic_reward if reward_moving_avg is None else
                             (1. - MOVING_AVG_COEF) * reward_moving_avg +
                             MOVING_AVG_COEF * episodic_reward)
        pbar.set_description(
            "EPISODE {} (total samples {}, subpolicy samples {}) --- Step {}, Reward {:.1f} (avg {:.1f})"
            .format(episode, total_collected_samples, memory.step, t,
                    episodic_reward, reward_moving_avg))
        metrics['episode'].append(episode)
        metrics['reward'].append(episodic_reward)
        metrics['episodic_step'].append(t)
        metrics['collected_total_samples'].append(total_collected_samples)
        if episode % visualize_interval == 0:
            # Visualize metrics
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'REWARD', log_dir)
            reward_avg = np.array(metrics['reward']) / np.array(
                metrics['episodic_step'])
            lineplot(metrics['episode'][-len(reward_avg):], reward_avg,
                     'AVG_REWARD', log_dir)
            lineplot(
                metrics['collected_total_samples'][-len(metrics['reward']):],
                metrics['reward'],
                'SAMPLE-REWARD',
                log_dir,
                xaxis='sample')

        # Save metrics for agent update
        if q1_loss is not None:
            metrics['q_loss'].append(np.mean([q1_loss, q2_loss]))
            metrics['policy_loss'].append(policy_loss)
            metrics['alpha_loss'].append(alpha_loss)
            metrics['alpha'].append(alpha)
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q_loss']):],
                         metrics['q_loss'], 'Q_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'POLICY_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'ALPHA_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'ALPHA', log_dir)

        # Insert a new subpolicy layer and reset the memory once the next scheduled sample count is reached
        if policy_switch_samples and total_collected_samples >= policy_switch_samples[0]:
            print(
                "----------------------\nInsert new policy\n----------------------"
            )
            agent.insert_subpolicy()
            memory.reset()
            metrics['policy_switch_epoch'].append(episode)
            metrics['policy_switch_sample'].append(total_collected_samples)
            policy_switch_samples = policy_switch_samples[1:]

        # Test a policy
        if episode % conf.test_interval == 0:
            test_rewards = []
            for _ in range(conf.test_times):
                episodic_reward = 0
                obs = env.reset()
                for t in range(conf.horizon):
                    h = agent.select_action(obs, eval=True)
                    a = agent.post_process_action(obs, h)
                    obs_next, r, done, _ = env.step(a)
                    episodic_reward += r
                    obs = obs_next

                    if done:
                        break

                test_rewards.append(episodic_reward)

            test_reward_avg, test_reward_std = np.mean(test_rewards), np.std(
                test_rewards)
            print("   TEST --- ({} episodes) Reward {:.1f} (pm {:.1f})".format(
                conf.test_times, test_reward_avg, test_reward_std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(test_rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], "TEST_REWARD", log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    # Save the final model
    torch.save({'agent': agent.state_dict()},
               os.path.join(data_dir, 'final_model.pth'))
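
train() pulls all of its hyperparameters from the TOML file loaded by load_toml_config and reads them as attributes of conf. As a rough guide to what such a config must provide, here is a hedged Python sketch (a SimpleNamespace used purely for illustration; the values are placeholders, not project defaults, and MOVING_AVG_COEF is likewise assumed to be a module-level constant):

from types import SimpleNamespace

MOVING_AVG_COEF = 0.1  # assumed smoothing constant for the reward moving average

# Fields that train() actually accesses on conf; values are placeholders.
conf = SimpleNamespace(
    environment="...",               # passed to make_env()
    replay_buffer_capacity=1_000_000,
    agent_type="...",                # attribute looked up on the agents module
    agent={},                        # extra keyword arguments for the agent constructor
    checkpoint="",                   # checkpoint path, or "" to start from scratch
    episodes=1000,
    horizon=1000,
    random_sample_num=10_000,        # warm-up threshold for random actions
    agent_update_batch_size=256,
    policy_switch_samples=None,      # optional list of sample counts that trigger subpolicy inserts
    test_interval=10,
    test_times=5,
    checkpoint_interval=10,
)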