# Assumes the usual module-level imports for these scripts: os, time,
# numpy as np, torch, imageio, SummaryWriter (tensorboardX), plus the
# project helpers seeding, envs, ReplayBuffer, MADDPG, transpose_list,
# and transpose_to_tensor.

def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 10000
    episode_length = 100
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 5000
    t = 0  # global step counter, incremented by parallel_envs each env step

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    # this may be a list of all environments
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    # this creates a list of models; each element in the list refers to an agent in the simulation:
    # [agent_one_ddpg, agent_two_ddpg, ...]
    # agent_one_ddpg contains that agent's actor and critic models,
    # e.g., agent_one_ddpg.actor, agent_one_ddpg.critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    # notice we jump forward by the number of parallel environments
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        # one reward per agent per parallel environment: shape (parallel_envs, 3)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # obs is the observation state space of all three agents in the 4 parallel envs;
        # for the Physical Deception environment with three agents it is of dimension 4x3x14.
        # obs_full is the world state irrespective of the agents, and its dimension is 4x14.
        # all_obs = array(number of environments 4, 2 elements)
        #   element 0: a list that contains 3 arrays, the state for each agent; each state is of size 14
        #   element 1: global state from the perspective of the target/green for its environment; contains 14 elements
        all_obs = env.reset()
        # obs: a list with 1 element per environment; each element contains a list of 3 arrays,
        #   each array being the state of one agent in that environment.
        # obs_full: the god's-eye view of each environment.
        #   So it is a list with 1 element per environment; each element contains an array of
        #   14 values which is the global state of that environment.
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            # we finish the episode before sampling the buffer for training
            # t jumps forward in multiples of the number of environments
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed:
            # transpose_to_tensor(obs) changes the data to each agent's point of view.
            # since we have 4 environments, there are 4 agent 1s, 4 agent 2s, and 4 agent 3s;
            # each agent has a state in each environment, so the states across
            # 4 environments for agent 1 form a 4x14 tensor.
            # transpose_to_tensor(obs) is a list of 3 elements, one per agent;
            # element 1 is a 4x14 array of that agent's observations across the 4 environments.
            # maddpg.act has a for loop that takes each element of obs, passes it to
            # the corresponding agent's actor model, and generates an action from each actor.
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # there are 4 actions per agent and 3 agents, 12 in total;
            # each action has 2 elements: force in the x and y directions.
            # actions_array is a tensor of shape (3 agents, 4 envs, 2 actions)
            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists:
            # flip the first two indices, because the input to step requires the
            # first index to correspond to the number of parallel environments.
            # the shape of actions_for_env is (4 envs, 3 agents, 2 actions)
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            # next_obs is the observation state space of all three agents in the 4 parallel envs;
            # for the Physical Deception environment with three agents it is of dimension 4x3x14.
            # next_obs_full is the world state irrespective of the agents, and its dimension is 4x14.
            # To gain more understanding, please see the code in the multiagent folder.
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                # although samples are drawn randomly, each sample carries all 3 agents' data,
                # so we know which rewards and actions belong to which agent.
                # samples is a list of 7 elements: obs, obs_full, action, reward,
                # next_obs, next_obs_full, done.
                # each element, say samples[0], is a list of 3 elements, one per agent,
                # holding that agent's value; for obs this is a vector with 14 values.
                # so asking for 2 samples, for example, returns 2 samples each containing
                # all 3 agents' states, rewards, etc.
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
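# The comments in main() above describe an axis flip: maddpg.act returns
# per-agent actions, stacked to shape (3 agents, 4 envs, 2), while env.step
# wants (4 envs, 3 agents, 2). A minimal, self-contained sketch of that
# reshuffle (shapes taken from the comments above; the data is dummy):
import numpy as np

def _demo_axis_flip():
    actions_array = np.zeros((3, 4, 2))        # (agents, envs, action dims)
    actions_for_env = np.rollaxis(actions_array, 1)
    assert actions_for_env.shape == (4, 3, 2)  # (envs, agents, action dims)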
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in range(0, number_of_episodes + parallel_envs, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            actions_array = torch.stack(actions).detach().numpy()

            # transpose the list of lists: flip the first two indices, because the
            # input to step requires the first index to be the parallel env index
            actions_for_env = np.rollaxis(actions_array, 1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # update once after every episode_per_update
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
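# `ReplayBuffer` itself is not shown in this listing. Below is a minimal
# sketch consistent with how it is used above (push(transition),
# sample(batchsize), len(buffer)); the project's actual class may store
# and regroup transitions differently:
import random
from collections import deque

class SimpleReplayBuffer:
    def __init__(self, size):
        self.memory = deque(maxlen=size)

    def push(self, transition):
        # transition = (obs, obs_full, actions, rewards, next_obs, next_obs_full, dones)
        self.memory.append(transition)

    def sample(self, batchsize):
        # draw random transitions and regroup them field-by-field, so that
        # samples[0] holds all obs, samples[1] all obs_full, and so on
        samples = random.sample(self.memory, batchsize)
        return [list(field) for field in zip(*samples)]

    def __len__(self):
        return len(self.memory)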
def main():
    seeding()
    # number of parallel agents
    parallel_envs = 4
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + '/log'
    model_dir = os.getcwd() + '/model_dir'
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    # for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))
        all_obs = env.reset()
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for this particular episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))
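# `transpose_list` and `transpose_to_tensor` flip the (env, agent) nesting
# into (agent, env), as the comments in the first main() explain. A minimal
# sketch consistent with that usage (the project's own helpers may differ):
import numpy as np
import torch

def transpose_list(mylist):
    # [[a0, b0], [a1, b1]] -> [[a0, a1], [b0, b1]]
    return list(map(list, zip(*mylist)))

def transpose_to_tensor(input_list):
    # same flip, but each regrouped item becomes a float tensor
    return [torch.tensor(np.asarray(x), dtype=torch.float) for x in zip(*input_list)]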
def main():
    seeding()
    parallel_envs = 4
    number_of_episodes = 1000
    episode_length = 80
    batchsize = 1000
    save_interval = 1000
    t = 0

    # amplitude of OU noise, which slowly decreases to 0
    noise = 2
    noise_reduction = 0.9999

    # how many episodes before update
    episode_per_update = 2 * parallel_envs

    log_path = os.getcwd() + "/log"
    model_dir = os.getcwd() + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    torch.set_num_threads(parallel_envs)
    """
    `env` controls three agents, two blue, one red.
    env.observation_space: [Box(14,), Box(14,), Box(14,)]
    env.action_space:      [Box(2,), Box(2,), Box(2,)]
    Box(14,) can be broken down into 2 + 3*2 + 3*2 = 14:
        (2)   location coordinates of the target landmark
        (3*2) the three agents' positions w.r.t. the target landmark
        (3*2) the three agents' velocities w.r.t. the target landmark
    """
    env = envs.make_parallel_env(parallel_envs)

    # keep 5000 episodes worth of replay
    buffer = ReplayBuffer(int(5000 * episode_length))

    # initialize policy and critic
    maddpg = MADDPG()
    logger = SummaryWriter(log_dir=log_path)
    agent0_reward = []
    agent1_reward = []
    agent2_reward = []

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    # use keep_awake to keep the workspace from disconnecting
    for episode in keep_awake(range(0, number_of_episodes, parallel_envs)):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, 3))

        # Consult `env_wrapper.py` line 19.
        all_obs = env.reset()
        """
        `all_obs` is a list of size `parallel_envs`; each item in the list is
        another list of size two: first is env.observation_space
        [Box(14,), Box(14,), Box(14,)], second is [Box(14,)], which is added
        to facilitate training https://goo.gl/Xtr6sF

        `obs` and `obs_full` are both lists of size `parallel_envs`;
        `obs` has the default observation space [Box(14,), Box(14,), Box(14,)],
        `obs_full` has the compounded observation space [Box(14,)].
        """
        obs, obs_full = transpose_list(all_obs)

        # for calculating rewards for one episode - addition of all time steps

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        if save_info:
            frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):
            t += parallel_envs

            # explore = only explore for a certain number of steps
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise *= noise_reduction

            # `actions_array` has shape (3, parallel_envs, 2)
            actions_array = torch.stack(actions).detach().numpy()

            # `actions_for_env` has shape (parallel_envs, 3, 2), because
            # input to `step` requires the first index to be `parallel_envs`
            actions_for_env = np.rollaxis(actions_array, axis=1)

            # step forward one frame
            next_obs, next_obs_full, rewards, dones, info = \
                env.step(actions_for_env)

            # add data to buffer
            transition = (obs, obs_full, actions_for_env, rewards,
                          next_obs, next_obs_full, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            obs, obs_full = next_obs, next_obs_full

            # save gif frame
            if save_info:
                frames.append(env.render('rgb_array'))
                tmax += 1

        # since `episode` advances by `parallel_envs`, this condition fires
        # once every `episode_per_update`=2*4 episodes
        if len(buffer) > batchsize and episode % episode_per_update < parallel_envs:
            # update the local network for each agent; `a_i` is the agent index
            for a_i in range(3):
                samples = buffer.sample(batchsize)
                maddpg.update(samples, a_i, logger)
            # soft update the target networks towards the actual networks
            maddpg.update_targets()

        for i in range(parallel_envs):
            agent0_reward.append(reward_this_episode[i, 0])
            agent1_reward.append(reward_this_episode[i, 1])
            agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            avg_rewards = [
                np.mean(agent0_reward),
                np.mean(agent1_reward),
                np.mean(agent2_reward)
            ]
            agent0_reward = []
            agent1_reward = []
            agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # Saves the model.
        save_dict_list = []
        if save_info:
            for i in range(3):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # Save gif files.
            imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
                            frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
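# The docstring in the main() above decomposes each agent's Box(14,)
# observation as 2 + 3*2 + 3*2. A small helper illustrating that split;
# the exact field ordering inside the vector is an assumption based on
# that docstring, not confirmed by the environment code shown here:
import numpy as np

def split_observation(obs_14):
    obs_14 = np.asarray(obs_14)
    assert obs_14.shape == (14,)
    landmark_xy = obs_14[0:2]                # target landmark coordinates
    positions = obs_14[2:8].reshape(3, 2)    # agents' positions w.r.t. the landmark
    velocities = obs_14[8:14].reshape(3, 2)  # agents' velocities w.r.t. the landmark
    return landmark_xy, positions, velocities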
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5

    # initialize environment
    torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=SEED,
                                 num_agents=num_agents, benchmark=BENCHMARK)

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])

    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_163018\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032521_211315\model_dir\episode-59994.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032621_054252\model_dir\episode-36000.pt'  # test1 2 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032821_102717\model_dir\episode-99000.pt'  # test1 6 agents
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\032921_160324\model_dir\episode-99000.pt'  # test2 6 agents pretrained
    # trained_checkpoint = r'E:\Ivan\UPC\UDACITY\DRL_Nanodegree\Part4\MADDPG\033021_203450\model_dir\episode-73002.pt'  # test2 6 agents pretrained
    # trained_checkpoint = "c4-a4-n01-a01-old-two-noskip/model_dir/episode-59999.pt"
    trained_checkpoint = "gat-huge1k/model_dir/episode-99000.pt"

    aux = torch.load(trained_checkpoint, map_location=torch.device('cpu'))
    for i in range(num_agents):
        # load the weights from file
        maddpg.maddpg_agent[i].actor.load_state_dict(aux[i]['actor_params'])
        maddpg.maddpg_agent[i].critic.load_state_dict(aux[i]['critic_params'])

    # Reset the environment
    all_obs = env.reset()
    # flip the first two indices
    obs_roll = np.rollaxis(all_obs, 1)
    obs = transpose_list(obs_roll)

    scores = 0
    t = 0

    while True:
        all_obs = env.reset()
        # flip the first two indices
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)
        scores = 0
        t = 0

        for _ in range(25):
            env.render('rgb_array')
            time.sleep(0.1)
            t += 1

            # select an action
            actions = maddpg.act(transpose_to_tensor(obs), noise=0.)
            actions_array = torch.stack(actions).detach().numpy()
            actions_for_env = np.rollaxis(actions_array, 1)

            # send all actions to the environment
            next_obs, rewards, dones, info = env.step(actions_for_env)

            # update the score (for each agent)
            scores += np.sum(rewards)
            print('\r\n Rewards at step %i = %.3f' % (t, scores))

            # for displaying learned policies
            # time.sleep(0.1)
            # env.render()

            # roll over states to next time step
            obs = next_obs

            # print("Score: {}".format(scores))
            if np.any(dones):
                print('done')

        print('Next:')

    env.close()
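# `tau` passed to MADDPG above controls the soft target-network update that
# maddpg.update_targets() performs in the training scripts ("soft update the
# target networks towards the actual networks"). A standard sketch of that
# operation for plain PyTorch modules (not the project's exact helper):
import torch

def soft_update(target: torch.nn.Module, source: torch.nn.Module, tau: float):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.copy_(tau * s_param + (1.0 - tau) * t_param)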
def main():
    seeding(seed=SEED)
    # number of parallel agents
    parallel_envs = 1
    # number of agents per environment
    num_agents = 5
    # number of training episodes.
    # change this to a higher number, say 30000, to experiment.
    number_of_episodes = 60000
    episode_length = 35
    # how many episodes between saving the policy and gif
    save_interval = 1000
    t = 0

    scenario_name = "simple_spread_ivan"

    # amplitude of OU noise
    # this slowly decreases to 0
    noise = 0.5  # was 2, try 0.5, 0.2
    noise_reduction = 0.9999  # 0.999

    #### DECAY
    initial_noise = 0.1
    decay = 0.01

    # how many episodes before update
    # episode_per_update = UPDATE_EVERY * parallel_envs

    common_folder = time.strftime("/%m%d%y_%H%M%S")
    log_path = os.getcwd() + common_folder + "/log"
    model_dir = os.getcwd() + common_folder + "/model_dir"
    os.makedirs(model_dir, exist_ok=True)

    # initialize environment
    # torch.set_num_threads(parallel_envs)
    env = envs.make_parallel_env(parallel_envs, seed=3, benchmark=BENCHMARK)
    # env = envs.make_env("simple_spread_ivan")

    # initialize replay buffer
    buffer = ReplayBuffer(int(BUFFER_SIZE))

    # initialize policy and critic
    maddpg = MADDPG(num_agents=num_agents,
                    discount_factor=GAMMA,
                    tau=TAU,
                    lr_actor=LR_ACTOR,
                    lr_critic=LR_CRITIC,
                    weight_decay=WEIGHT_DECAY)
    logger = SummaryWriter(log_dir=log_path)

    agents_reward = []
    for n in range(num_agents):
        agents_reward.append([])
    # agent0_reward = []
    # agent1_reward = []
    # agent2_reward = []

    agent_info = [[[]]]  # placeholder for benchmarking info

    # training loop
    # show progressbar
    import progressbar as pb
    widget = [
        '\repisode: ', pb.Counter(), '/', str(number_of_episodes), ' ',
        pb.Percentage(), ' ', pb.ETA(), ' ',
        pb.Bar(marker=pb.RotatingMarker()), ' '
    ]
    timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start()

    print('Starting iterations...')
    for episode in range(0, number_of_episodes, parallel_envs):

        timer.update(episode)
        reward_this_episode = np.zeros((parallel_envs, num_agents))

        all_obs = env.reset()
        # # flip the first two indices
        # ADD FOR WITHOUT PARALLEL ENV
        # all_obs = np.expand_dims(all_obs, axis=0)
        obs_roll = np.rollaxis(all_obs, 1)
        obs = transpose_list(obs_roll)

        # save info or not
        save_info = (episode % save_interval < parallel_envs
                     or episode == number_of_episodes - parallel_envs)
        frames = []
        tmax = 0

        # if save_info:
        #     frames.append(env.render('rgb_array'))

        for episode_t in range(episode_length):

            # get actions
            # explore = only explore for a certain number of episodes
            # action input needs to be transposed
            actions = maddpg.act(transpose_to_tensor(obs), noise=noise)
            noise = max(initial_noise * decay**(episode_t / 20000), 0.001)
            # noise = max(noise * noise_reduction, 0.001)

            actions_array = torch.stack(actions).detach().numpy()
            # transpose the list of lists: flip the first two indices, because the
            # input to step requires the first index to be the parallel env index
            actions_for_env = np.rollaxis(actions_array, 1)

            # environment step
            # step forward one frame
            # next_obs, next_obs_full, rewards, dones, info = env.step(actions_for_env)
            # ADD FOR WITHOUT PARALLEL ENV
            # next_obs, rewards, dones, info = env.step(actions_for_env)
            next_obs, rewards, dones, info = env.step(actions_for_env)
            # rewards_sum += np.mean(rewards)

            # collect experience
            transition = (obs, actions_for_env, rewards, next_obs, dones)
            buffer.push(transition)

            reward_this_episode += rewards

            # obs, obs_full = next_obs, next_obs_full
            obs = next_obs

            # increment global step counter
            t += parallel_envs

            # save gif frame
            if save_info:
                # frames.append(env.render('rgb_array'))
                tmax += 1

            # for benchmarking learned policies
            if BENCHMARK:
                for i, inf in enumerate(info):
                    agent_info[-1][i].append(inf['n'])

        # update once after every episode_per_update
        # if len(buffer) > BATCH_SIZE and episode % episode_per_update < parallel_envs:
        if len(buffer) > BATCH_SIZE and episode % UPDATE_EVERY < parallel_envs:
            for _ in range(UPDATE_TIMES):
                for a_i in range(num_agents):
                    samples = buffer.sample(BATCH_SIZE)
                    maddpg.update(samples, a_i, logger)
                maddpg.update_targets()  # soft update the target networks towards the actual networks

        for i in range(parallel_envs):
            for n in range(num_agents):
                agents_reward[n].append(reward_this_episode[i, n])
            # agent0_reward.append(reward_this_episode[i, 0])
            # agent1_reward.append(reward_this_episode[i, 1])
            # agent2_reward.append(reward_this_episode[i, 2])

        if episode % 100 == 0 or episode == number_of_episodes - 1:
            # avg_rewards = [np.mean(agent0_reward), np.mean(agent1_reward), np.mean(agent2_reward)]
            avg_rewards = []
            for n in range(num_agents):
                avg_rewards.append(np.mean(agents_reward[n]))
            # agent0_reward = []
            # agent1_reward = []
            # agent2_reward = []
            for a_i, avg_rew in enumerate(avg_rewards):
                logger.add_scalar('agent%i/mean_episode_rewards' % a_i,
                                  avg_rew, episode)

        # saving model
        save_dict_list = []
        if save_info:
            print('agent_info benchmark=', agent_info)
            for i in range(5):
                save_dict = {
                    'actor_params':
                        maddpg.maddpg_agent[i].actor.state_dict(),
                    'actor_optim_params':
                        maddpg.maddpg_agent[i].actor_optimizer.state_dict(),
                    'critic_params':
                        maddpg.maddpg_agent[i].critic.state_dict(),
                    'critic_optim_params':
                        maddpg.maddpg_agent[i].critic_optimizer.state_dict()
                }
                save_dict_list.append(save_dict)

            torch.save(save_dict_list,
                       os.path.join(model_dir, 'episode-{}.pt'.format(episode)))

            # save gif files
            # imageio.mimsave(os.path.join(model_dir, 'episode-{}.gif'.format(episode)),
            #                 frames, duration=.04)

    env.close()
    logger.close()
    timer.finish()
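# The training loops above use two different exploration-noise schedules:
# per-step multiplicative decay (noise *= noise_reduction) and exponential
# decay with a floor (max(initial_noise * decay**(step / 20000), 0.001)).
# A quick, self-contained comparison using the constants from the snippets:
def multiplicative_noise(step, noise0=2.0, reduction=0.9999):
    return noise0 * reduction**step

def floored_exponential_noise(step, noise0=0.1, decay=0.01, floor=0.001):
    return max(noise0 * decay**(step / 20000), floor)

for step in (0, 1000, 10000, 100000):
    print(step, multiplicative_noise(step), floored_exponential_noise(step))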