def main(arglist):
    ACTORS = 1
    env = EnvWrapper(arglist.scenario, ACTORS, arglist.saved_episode)

    # In this script arglist.eval gates the training-side logic (updates,
    # checkpointing, TensorBoard logging); rendering happens when it is off.
    if arglist.eval:
        current_time = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        writer = SummaryWriter(log_dir='./logs/' + current_time + '-' + arglist.scenario)

    maddpg_wrapper = MADDPG(ACTORS)
    maddpg_wrapper.create_agents(env, arglist)

    j = 0
    for episode in range(arglist.max_episode):
        obs = env.reset()
        terminal = False
        maddpg_wrapper.reset()
        total_reward = [0 for _ in maddpg_wrapper.workers]
        step = 0

        while not terminal and step < 25:
            if not arglist.eval:
                env.render(0)
                time.sleep(0.03)

            actions = maddpg_wrapper.take_actions(obs)
            obs2, reward, done = env.step(actions)

            # accumulate per-agent rewards
            for actor in range(ACTORS):
                for i, rew in enumerate(reward[actor]):
                    total_reward[i] += rew

            j += ACTORS
            # terminal = all(done)
            if arglist.eval:
                maddpg_wrapper.update(j, ACTORS, actions, reward, obs, obs2, done)

            obs = obs2
            step += 1

        if arglist.eval and episode % arglist.saved_episode == 0 and episode > 0:
            maddpg_wrapper.save(episode)

        if arglist.eval:
            for worker, ep_ave_max in zip(maddpg_wrapper.workers,
                                          maddpg_wrapper.ep_ave_max_q_value):
                print(worker.pos, ' => average_max_q: ', ep_ave_max / float(step),
                      ' Reward: ', total_reward[worker.pos], ' Episode: ', episode)
                writer.add_scalar(str(worker.pos) + '/Average_max_q',
                                  ep_ave_max / float(step), episode)
                writer.add_scalar(str(worker.pos) + '/Reward Agent',
                                  total_reward[worker.pos], episode)

    env.close()
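# main() above assumes a parsed arglist plus a few imports it does not show.
# A minimal entry-point sketch follows; the flag names simply mirror the
# arglist attributes used in main(), and the defaults are illustrative
# assumptions, not values from the original source.
import argparse
import time
from time import gmtime, strftime
from tensorboardX import SummaryWriter  # assumption; torch.utils.tensorboard also works

def parse_args():
    parser = argparse.ArgumentParser(description="MADDPG particle-env runner")
    parser.add_argument("--scenario", type=str, default="simple_spread")
    parser.add_argument("--max-episode", type=int, default=10000)
    parser.add_argument("--saved-episode", type=int, default=100)
    parser.add_argument("--eval", action="store_true",
                        help="enable training updates, checkpointing and logging")
    return parser.parse_args()

if __name__ == "__main__":
    main(parse_args())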
def train(env, num_episodes=5000, max_t=1000, warmup_episodes=0):
    """Train the agents and monitor their performance.

    Params
    ======
    - env: instance of the environment
    - num_episodes: maximum number of episodes of agent-environment interaction
    - max_t: maximum number of timesteps per episode
    - warmup_episodes: how many episodes to explore and collect samples
      before learning begins

    Returns
    =======
    - episode_scores: list containing the max score received in each episode
    """
    # get the default brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    # reset the environment
    env_info = env.reset(train_mode=True)[brain_name]

    # number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)

    # number of actions
    action_size = brain.vector_action_space_size
    print('Number of actions:', action_size)

    # examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    # amplitude of OU noise; this slowly decays toward 0
    noise = 1.0
    noise_reduction = 0.9999

    # list containing max scores from each episode
    episode_scores = []
    # last 100 scores
    scores_window = deque(maxlen=100)
    mean_score = 0.0

    maddpg = MADDPG(state_size, action_size, num_agents * state_size,
                    num_agents * action_size)

    # for each episode
    for i_episode in range(1, num_episodes + 1):
        # reset the environment and begin the episode
        env_info = env.reset(train_mode=True)[brain_name]
        maddpg.reset()
        # get the current state (for each agent)
        states = env_info.vector_observations
        # initialize the score (for each agent)
        scores = np.zeros(num_agents)

        for t in range(max_t):
            # select an action (for each agent)
            if i_episode > warmup_episodes:
                actions = maddpg.act(states, noise)
                noise *= noise_reduction
            else:
                # collect random samples to explore and fill the replay buffer
                actions = np.random.uniform(-1, 1, (num_agents, action_size))
            # send all actions to the environment
            env_info = env.step(actions)[brain_name]
            # get next state (for each agent)
            next_states = env_info.vector_observations
            # get reward (for each agent)
            rewards = env_info.rewards
            # see if episode finished
            dones = env_info.local_done
            # agents perform internal updates based on sampled experience
            maddpg.step(states, actions, rewards, next_states, dones)
            # roll over states to next time step
            states = next_states
            # learn when time is right
            if t % LEARN_EVERY == 0 and i_episode > warmup_episodes:
                for _ in range(LEARN_BATCH):
                    maddpg.learn()
            # update the score (for each agent)
            scores += rewards
            # exit loop if episode finished
            if np.any(dones):
                break

        episode_max_score = np.max(scores)
        episode_scores.append(episode_max_score)
        if i_episode > warmup_episodes:
            # save final score
            scores_window.append(episode_max_score)
            mean_score = np.mean(scores_window)
            # monitor progress
            if i_episode % 10 == 0:
                print("\rEpisode {:d}/{:d} || Average score {:.2f}".format(
                    i_episode, num_episodes, mean_score))
        else:
            print("\rWarmup episode {:d}/{:d}".format(i_episode, warmup_episodes),
                  end="")

        if i_episode % SAVE_EVERY == 0 and i_episode > warmup_episodes:
            maddpg.save_weights(i_episode)

        # check if the task is solved
        if i_episode >= 100 and mean_score >= 0.5:
            print('\nEnvironment solved in {:d} episodes. Average score: {:.2f}'.format(
                i_episode, mean_score))
            maddpg.save_weights()
            break
        if i_episode == num_episodes:
            print("\nGame over. Too bad! Final score {:.2f}\n".format(mean_score))

    return episode_scores
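# train() above relies on a few module-level names it does not define
# (LEARN_EVERY, LEARN_BATCH, SAVE_EVERY, deque, np, and the project's MADDPG
# wrapper). A minimal sketch of the assumed imports and hyperparameters;
# the values are illustrative assumptions, not taken from the original source.
from collections import deque
import numpy as np

LEARN_EVERY = 1   # learn every N environment steps
LEARN_BATCH = 3   # gradient updates per learning step
SAVE_EVERY = 500  # checkpoint interval in episodes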
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

# load the trained agents and watch them play
agents = MADDPG(state_size, action_size)
agents.load_from_file()

for i in range(1, 20):
    env_info = env.reset(train_mode=False)[brain_name]
    states = env_info.vector_observations
    score = np.zeros((2,))  # two agents in Tennis
    agents.reset()
    for t in range(100):
        actions = agents.act(states)
        env_info = env.step(actions)[brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        states = next_states
        score += np.array(rewards)
        if any(dones):
            break
    # report the per-agent episode score
    print('Episode {}: scores {}'.format(i, score))

env.close()
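# The snippet above starts mid-script: env, brain_name, env_info and
# action_size must already exist. A minimal sketch of the assumed setup;
# the Tennis build path is an illustrative assumption.
import numpy as np
from unityagents import UnityEnvironment

env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
brain_name = env.brain_names[0]
brain = env.brains[brain_name]
env_info = env.reset(train_mode=False)[brain_name]
action_size = brain.vector_action_space_size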
def main():
    ##########
    # CONFIG #
    ##########
    # Target Reward
    tgt_score = 0.5
    # Device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Seed
    seed = 7
    seeding(seed)
    # Model Architecture
    # Actor
    hidden_in_actor = 256
    hidden_out_actor = 128
    lr_actor = 1e-4
    # Critic
    hidden_in_critic = 256
    hidden_out_critic = 128
    lr_critic = 3e-4
    weight_decay_critic = 0
    # Episodes
    number_of_episodes = 10000
    episode_length = 2000
    # Buffer
    buffer_size = int(1e6)
    batchsize = 512
    # Agent Update Frequency
    episode_per_update = 1
    # Reward Discount Factor
    discount_factor = 0.95
    # Soft Update Weight
    tau = 1e-2
    # Noise Process
    noise_factor = 2
    noise_reduction = 0.9999
    noise_floor = 0.0
    # Rolling Window Length
    win_len = 100
    # Save Frequency
    save_interval = 200
    # Logger
    log_path = os.getcwd() + "/log"
    logger = SummaryWriter(log_dir=log_path)
    # Model Directory
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)
    # Load Saved Model
    load_model = False

    ####################
    # Load Environment #
    ####################
    env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64")
    # Get brain
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    print('Brain Name:', brain_name)
    # Reset the environment
    env_info = env.reset(train_mode=True)[brain_name]
    # Number of agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    # Size of each action
    action_size = brain.vector_action_space_size
    print('Size of each action:', action_size)
    # Examine the state space
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('There are {} agents. Each observes a state with length: {}'.format(
        states.shape[0], state_size))

    ####################
    # Show Progressbar #
    ####################
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()
    start = time.time()

    ###############
    # Multi Agent #
    ###############
    maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor,
                    hidden_out_actor, lr_actor, hidden_in_critic,
                    hidden_out_critic, lr_critic, weight_decay_critic,
                    discount_factor, tau, seed, device)

    if load_model:
        load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt'))
        for i in range(num_agents):
            maddpg.maddpg_agent[i].actor.load_state_dict(
                load_dict_list[i]['actor_params'])
            maddpg.maddpg_agent[i].actor_optimizer.load_state_dict(
                load_dict_list[i]['actor_optim_params'])
            maddpg.maddpg_agent[i].critic.load_state_dict(
                load_dict_list[i]['critic_params'])
            maddpg.maddpg_agent[i].critic_optimizer.load_state_dict(
                load_dict_list[i]['critic_optim_params'])

    #################
    # Replay Buffer #
    #################
    rebuffer = ReplayBuffer(buffer_size, seed, device)

    #################
    # TRAINING LOOP #
    #################
    # initialize scores
    scores_history = []
    scores_window = deque(maxlen=save_interval)

    for i_episode in range(number_of_episodes):
        timer.update(i_episode)
        # Reset environment
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        scores = np.zeros(num_agents)
        # Reset agents
        maddpg.reset()

        for episode_t in range(episode_length):
            # Explore with a decaying noise factor
            actions = maddpg.act(states, noise_factor=noise_factor)
            env_info = env.step(actions)[brain_name]    # environment reacts
            next_states = env_info.vector_observations  # get the next states
            rewards = env_info.rewards                  # get the rewards
            dones = env_info.local_done                 # see if episode has finished

            ###################
            # Save Experience #
            ###################
            rebuffer.add(states, actions, rewards, next_states, dones)

            scores += rewards
            states = next_states
            if any(dones):
                break

        scores_history.append(np.max(scores))  # save most recent score
        scores_window.append(np.max(scores))
        avg_rewards = np.mean(scores_window)
        # Reduce noise factor
        noise_factor = max(noise_floor, noise_factor * noise_reduction)

        #########
        # LEARN #
        #########
        if len(rebuffer) > batchsize and i_episode % episode_per_update == 0:
            for a_i in range(num_agents):
                samples = rebuffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # Soft update of the target networks
            maddpg.update_targets()

        ##################
        # Track Progress #
        ##################
        if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print('\nElapsed time {:.1f} min \t Update Count {} \t Last Episode t {}'.format(
                      (time.time() - start) / 60, maddpg.update_count, episode_t),
                  '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.format(
                      i_episode, avg_rewards, noise_factor),
                  end="\n")

        ##############
        # Save Model #
        ##############
        save_info = (i_episode % save_interval == 0
                     or i_episode == number_of_episodes - 1)
        if save_info:
            save_dict_list = []
            for i in range(num_agents):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)
            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-Latest.pt'))
            pd.Series(scores_history).to_csv(os.path.join(model_dir, "scores.csv"))

            # Plot the scores
            rolling_mean = pd.Series(scores_history).rolling(win_len).mean()
            fig = plt.figure()
            ax = fig.add_subplot(111)
            plt.plot(np.arange(len(scores_history)), scores_history)
            plt.axhline(y=tgt_score, color='r', linestyle='dashed')
            plt.plot(rolling_mean, lw=3)
            plt.ylabel('Score')
            plt.xlabel('Episode #')
            fig.savefig(os.path.join(model_dir, 'Average_Score.pdf'))
            fig.savefig(os.path.join(model_dir, 'Average_Score.jpg'))
            plt.close()

        # Stop once the rolling average clears the target score
        if avg_rewards > tgt_score:
            logger.add_scalars('rewards', {
                'Avg Reward': avg_rewards,
                'Noise Factor': noise_factor
            }, i_episode)
            print('\nElapsed time {:.1f} min \t Update Count {} \t Last Episode t {}'.format(
                      (time.time() - start) / 60, maddpg.update_count, episode_t),
                  '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:.2f}'.format(
                      i_episode, avg_rewards, noise_factor),
                  end="\n")
            break

    env.close()
    logger.close()
    timer.finish()
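# seeding() used in the CONFIG block above is not defined in this file; a
# minimal sketch of such a helper, assuming it pins the Python, NumPy and
# PyTorch random number generators.
import random
import numpy as np
import torch

def seeding(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)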
def train_maddpg(env, max_episode=1000, max_t=1000, print_every=5,
                 check_history=100, sigma_start=0.2, sigma_end=0.01,
                 sigma_decay=0.995):
    # reset the environment
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    env_info = env.reset(train_mode=True)[brain_name]

    # action and state size
    action_size = brain.vector_action_space_size
    states = env_info.vector_observations
    state_size = states.shape[1]
    print('State size:', state_size)
    print('Action size:', action_size)

    # initialize the agents
    num_agents = len(env_info.agents)
    print('Number of agents:', num_agents)
    maddpg = MADDPG(state_size, action_size, random_seed=123)

    scores_deque = deque(maxlen=check_history)
    scores = []

    # learn over multiple episodes
    sigma = sigma_start
    for episode in range(max_episode):
        # prepare for training in the current episode
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        score = 0
        maddpg.reset(sigma=sigma)

        # play and learn in the current episode
        for t in range(max_t):
            actions = maddpg.act(states)
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations  # get next state (for each agent)
            rewards = env_info.rewards                  # get reward (for each agent)
            dones = env_info.local_done                 # see if episode finished
            maddpg.step(t, states, actions, rewards, next_states, dones)
            states = next_states
            # track the max over the two agents as the current score
            reward = np.max(rewards)
            score += reward
            if np.any(dones):
                break

        # decay sigma for exploration noise
        sigma = max(sigma_end, sigma * sigma_decay)

        # record the score
        episode_score = score
        scores_deque.append(episode_score)
        scores.append(episode_score)
        if episode % print_every == 0:
            print('Episode {}\tScore: {:.4f}\tAverage Score: {:.4f}'.format(
                episode, episode_score, np.mean(scores_deque)))

        if np.mean(scores_deque) >= 0.5 and episode >= check_history:
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.4f}'.format(
                episode - check_history, np.mean(scores_deque)))
            for agent in maddpg.ddpg_agents:
                torch.save(agent.actor_local.state_dict(),
                           'actor_agent_' + str(agent.id) + '.pth')
                torch.save(agent.critic_local.state_dict(),
                           'critic_agent_' + str(agent.id) + '.pth')
            break

    return scores
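# A hypothetical driver for train_maddpg; the Tennis build path is an
# illustrative assumption, and the plot simply visualizes the scores list
# the function returns.
import matplotlib.pyplot as plt
from unityagents import UnityEnvironment

if __name__ == '__main__':
    env = UnityEnvironment(file_name='./Tennis_Linux/Tennis.x86_64')
    scores = train_maddpg(env, max_episode=3000)
    env.close()

    plt.plot(scores)
    plt.xlabel('Episode #')
    plt.ylabel('Max agent score')
    plt.savefig('train_maddpg_scores.png')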