Example No. 1
    def create_environment(self):
        """
        Create environment instance
        :return: environment (gym interface), env_name, task_name, n_agents, observation_sizes,
                 action_sizes, discrete_actions
        """
        # load scenario from script
        if self.arglist.partial_observable:
            scenario = scenarios.load(self.arglist.scenario +
                                      "_partial_observable.py").POScenario()
        elif self.arglist.observation_noise:
            scenario = scenarios.load(self.arglist.scenario +
                                      "_observation_noise.py").ONScenario()
        elif self.arglist.environment_noise:
            scenario = scenarios.load(self.arglist.scenario +
                                      "_env_noise.py").ENScenario()
        else:
            scenario = scenarios.load(self.arglist.scenario + ".py").Scenario()

        # create world
        world = scenario.make_world()
        # create multiagent environment
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)

        env_name = "mape"
        task_name = "mape_" + self.arglist.scenario

        n_agents = env.n
        print("Observation spaces: ",
              [env.observation_space[i] for i in range(n_agents)])
        print("Action spaces: ",
              [env.action_space[i] for i in range(n_agents)])
        observation_sizes = self.extract_sizes(env.observation_space)
        action_sizes = self.extract_sizes(env.action_space)
        discrete_actions = True

        return (
            env,
            env_name,
            task_name,
            n_agents,
            observation_sizes,
            action_sizes,
            discrete_actions,
        )
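The extract_sizes helper used above is not shown in this example. A minimal sketch of what such a helper might look like, assuming the per-agent spaces are standard gym Box or Discrete instances (the function below is an assumption, not code from the original project):

import numpy as np
from gym import spaces


def extract_sizes(space_list):
    """Sketch: return a flat dimensionality for each per-agent space."""
    sizes = []
    for space in space_list:
        if isinstance(space, spaces.Box):
            # continuous space: flatten its shape
            sizes.append(int(np.prod(space.shape)))
        elif isinstance(space, spaces.Discrete):
            # discrete space: number of choices
            sizes.append(space.n)
        else:
            raise NotImplementedError("Unsupported space type: %r" % type(space))
    return sizes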
Example No. 2
 def get_env(self,
             world,
             reset_callback=None,
             reward_callback=None,
             observation_callback=None,
             info_callback=None,
             done_callback=None,
             shared_viewer=True,
             discrete_action_space=True):
     return MultiAgentEnv(world,
                          reset_callback,
                          reward_callback,
                          observation_callback,
                          info_callback,
                          done_callback,
                          shared_viewer,
                          discrete_action_space=discrete_action_space)
Example No. 3
def create_env(env_name):
    if env_name not in all_envs():
        raise RuntimeError("Invalid multi-agent environment: " + env_name)
    # load scenario from script
    scenario = scenarios.load(env_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(
        world,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
        info_callback=None,
        shared_viewer=False,
    )
    return env
Example No. 4
def make_env(args):
    """Make multi-agent particle environment
    Ref: https://github.com/openai/maddpg/blob/master/experiments/train.py
    """
    scenario = scenarios.load(args.env_name + ".py").Scenario()
    world = scenario.make_world()
    done_callback = None

    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        done_callback=done_callback)

    assert env.discrete_action_space is False, "For cont. action, this flag must be False"

    return env
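A hedged usage sketch (not part of the source) of how the continuous-action environment returned by make_env above might be stepped, assuming each entry of env.action_space is a gym Box whose sample() yields a valid action:

env = make_env(args)   # args.env_name is assumed to name a valid scenario
obs_n = env.reset()
for _ in range(100):
    # one random continuous action per agent, drawn from its own action space
    act_n = [space.sample() for space in env.action_space]
    obs_n, reward_n, done_n, _ = env.step(act_n)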
Example No. 5
def create_env(env_name):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(env_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(
        world,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
        info_callback=None,
        shared_viewer=False,
    )
    return env
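A minimal rollout sketch (an assumption, not part of the original) for the environment returned by create_env. It sets the discrete_action_input flag, as Example No. 19 below also does, so plain integer actions are accepted, and it assumes a scenario without a communication channel (e.g. "simple") so that every per-agent action space is Discrete:

import numpy as np

env = create_env("simple")
env.discrete_action_input = True   # accept integer actions (cf. Example No. 19)
obs_n = env.reset()
for _ in range(100):
    # one random integer action per agent
    act_n = [np.random.randint(space.n) for space in env.action_space]
    obs_n, reward_n, done_n, _ = env.step(act_n)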
Example No. 6
def make_multiagent_env(env_id, num_agents, dist_threshold, arena_size,
                        identity_size):
    scenario = scenarios.load(env_id + ".py").Scenario(
        num_agents=num_agents,
        dist_threshold=dist_threshold,
        arena_size=arena_size,
        identity_size=identity_size)
    world = scenario.make_world()

    env = MultiAgentEnv(
        world=world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=scenario.info if hasattr(scenario, 'info') else None,
        discrete_action=True,
        done_callback=scenario.done,
        cam_range=arena_size)
    return env
Example No. 7
def make_env(args):
    # set scenario
    scenario = scenarios.load(args['scenario'] + ".py").Scenario()
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    # both good and bad agents
    args['n_players'] = env.n
    # train only good agents
    args['n_agents'] = env.n - args['n_enemies']
    # obtain shapes of individual observations of agents
    args['obs_shape'] = [
        env.observation_space[i].shape[0] for i in range(args['n_agents'])
    ]
    action_shape = []
    for content in env.action_space:
        action_shape.append(content.n)
    args['action_shape'] = action_shape[:args['n_agents']]
    args['high_action'] = 1
    args['low_action'] = -1
    return env, args
Example No. 8
def make_env(args):
    """Load multi-agent particle environment
    This code is modified from: https://github.com/openai/maddpg/blob/master/experiments/train.py
    """
    # Check github branch
    check_github(path="./thirdparty/multiagent-particle-envs",
                 branch_name="opponent")

    # Load multi-agent particle env
    scenario = scenarios.load(args.env_name + ".py").Scenario()
    world = scenario.make_world()
    done_callback = None

    env = MultiAgentEnv(world,
                        reset_callback=scenario.reset_world,
                        reward_callback=scenario.reward,
                        observation_callback=scenario.observation,
                        done_callback=done_callback)

    assert env.discrete_action_space is False, "For cont. action, this flag must be False"

    return env
Example No. 9
def make_env(scenario_name, args):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # set fixed partition
    if args.partition != 'rand':
        print('>> Ensemble Partition Type = {}!'.format(args.partition))
        scenario.partition = args.partition
    if args.partition_flag is not None:
        print('>> Partition Flag = {}!'.format(args.partition_flag))
        scenario.partition_flag = args.partition_flag
    if args.evaluate and args.measure_success:
        print('>> Evaluating Success Rate!')
        scenario.measure_success = True
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    return env
Example No. 10
def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios
    global scenario

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    env.window_pos = 'right'
    env.force_discrete_action = False
    return env
Example No. 11
def train(scenario):
    path_to_save = 'models/' + scenario.__module__.split('.')[-1] + '/simple'
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        shared_viewer=True,
    )

    with U.single_threaded_session() as sess:
        simple_agents = [VectorAgent(env, 0, 1),
                         StayAgent(env, 1)]  # good agent
        evaluator = evaluate_models.Evaluator(args,
                                              scenario,
                                              save=scenario.name + '/' +
                                              str(1))
        evaluator.evaluate(env, simple_agents, 0)
Example No. 12
def _make_env(scenario_name, horizon, monitor_enabled, video_frequency):
    if scenario_name in CUSTOM_SCENARIOS:
        # Scenario file must exist locally
        file_path = os.path.join(os.path.dirname(__file__),
                                 scenario_name + '.py')
        scenario = imp.load_source('', file_path).Scenario()
    else:
        scenario = scenarios.load(scenario_name + '.py').Scenario()

    world = scenario.make_world()

    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)
    env.metadata['video.frames_per_second'] = 8

    env = ParticleEnvRenderWrapper(env, horizon)

    if not monitor_enabled:
        return env

    return wrappers.Monitor(env,
                            './logs/videos',
                            resume=True,
                            video_callable=_video_callable(video_frequency))
Example No. 13
def train(scenario):
    train_n = 0
    world = scenario.make_world()
    env = MultiAgentEnv(world, reset_callback=scenario.reset_world, reward_callback=scenario.reward,
                        observation_callback=scenario.observation, info_callback=None,
                        done_callback=scenario.done, collision_callback=scenario.is_collision,
                        shared_viewer=True, )
    evaluator = evaluate_models.Evaluator(args, scenario, save=scenario.name + '/' + str(train_n))


    simple_agents = [StayAgent(env, 1), VectorAgent(env, 0, 1)]  # good agent

    policies = [simple_agents[1], simple_agents[0]]
    print('agents are created')
    statistics_header = ["episode"]
    statistics_header.append("steps")
    statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
    statistics_header.extend(["q_{}".format(i) for i in range(env.n)])
    statistics = utilities.Time_Series_Statistics_Store(
        statistics_header)

    statistics.dump("{}_{}.csv".format(
        args.experiment_prefix + scenario.__module__.split('.')[-1], 0))
    evaluator.evaluate(env, policies, 0)
Example No. 14
def make_env():
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    args = parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world,
                        scenario.reset_world,
                        scenario.reward,
                        scenario.observation,
                        info_callback=None,
                        shared_viewer=True)

    args.action_dimension = env.action_space[0].n
    args.observation_dimension = env.observation_space[0].shape[0]
    args.low_action = 0
    args.high_action = 1

    return env, args
Example No. 15
def make_env(scenario_name, arglist, benchmark=False):
    global scenario
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    scenario.individual_reward = arglist.indv_rew
    scenario.cooperative_reward = arglist.coop_rew
    scenario.crash_punishment = arglist.crash_pun

    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.window_pos = 'left'
    env.force_discrete_action = False
    return env
Example No. 16
def make_env(scenario_name, benchmark=False):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name   :   name of the scenario from ./scenarios/ to be loaded
                            (without the .py extension)
        benchmark       :   whether you want to produce benchmarking data
                            (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space  :   Returns the observation space for each agent
        .action_space       :   Returns the action space for each agent
        .n                  :   Returns the number of Agents
    '''
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    world.dim_c = 0
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation)
    env.discrete_action_space = False
    env.discrete_action_input = False
    scenario.reset_world(world)
    return env, scenario, world
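As the docstring notes, the returned env behaves much like a gym environment. A hedged sketch of the reset/step/render loop it describes, with random actions sampled from each agent's action space; whether raw samples are accepted directly depends on the discrete/continuous action flags set above, so treat this purely as an illustration:

env, scenario, world = make_env("simple_spread")
obs_n = env.reset()
for _ in range(50):
    # random per-agent actions
    act_n = [space.sample() for space in env.action_space]
    obs_n, reward_n, done_n, _ = env.step(act_n)
    env.render()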
Example No. 17
    with torch.no_grad():
        prediction = net(obs, actions)
        loss = F.mse_loss(torch.tensor(next_obs).to(args.device),
                          prediction).item()
    net.train()
    return loss


if __name__ == '__main__':
    args = parse_commandline()
    print(args)
    scenario = scenarios.load(args.scenario).Scenario()
    world = scenario.make_world()
    env = MultiAgentEnv(world,
                        scenario.reset_world,
                        scenario.reward,
                        scenario.observation,
                        info_callback=None,
                        shared_viewer=False)
    buffer, eval_set = create_buffer(env, size=args.buffer_size)

    with open('buffer.pkl', 'wb') as dst:
        pickle.dump([buffer, eval_set], dst)
    with open('buffer.pkl', 'rb') as src:
        buffer, eval_set = pickle.load(src)
    print(f'Buffer created with size {len(buffer)}')
    net = Network()
    net.to(args.device)
    # net.load('model.torch')
    net = train(net, buffer, eval_set, args)
    net.save('model.torch')
Example No. 18
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s',
                        '--scenario',
                        default='simple.py',
                        help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world,
                        scenario.reset_world,
                        scenario.reward,
                        scenario.observation,
                        info_callback=None,
                        shared_viewer=False)

    # Define the algorithm using DQN
    RL = DeepQNetwork(
        n_actions=env.action_space[0].n,
        n_features=env.observation_space[0].shape[0],
        learning_rate=0.01,
        e_greedy=0.9,
        replace_target_iter=100,
        memory_size=2000,
        e_greedy_increment=0.0008,
    )

    total_steps = 0
Example No. 19
import numpy as np

from multiagent.environment import MultiAgentEnv

import multiagent.scenarios as scenarios
from keras.models import load_model

from time import sleep

# load scenario from script
scenario = scenarios.load("simple" + ".py").Scenario()
# create world
world = scenario.make_world()
# create multiagent environment

env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                    scenario.observation)

nb_agent = len(env.agents)
action_space_n = env.action_space
observation_space_n = env.observation_space

agents = [load_model("../agent-" + str(i) + ".model") for i in range(nb_agent)]

while True:
    state_n = env.reset()
    action_space_n = env.action_space
    env.discrete_action_input = True

    state_n = [
        np.reshape(state_n[i], (1, observation_space_n[i].shape[0]))
        for i in range(nb_agent)
Example No. 20
import multiagent.scenarios as scenarios

sys.path.insert(1, os.path.join(sys.path[0], '..'))

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env,i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
Example No. 21
from multiagent.environment import MultiAgentEnv
from multiagent.policy import InteractivePolicy
import multiagent.scenarios as scenarios

if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s', '--scenario', default='simple.py', help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env,i) for i in range(env.n)]
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
        obs_n, reward_n, done_n, _ = env.step(act_n)
        # render all agent views
        env.render()
Example No. 22
import pickle

from maddpg import MADDPG
# load the setting of the environment.
scenario = scenarios.load(
    '/home/zw/lyy/maddpg/multiagent-particle-envs/multiagent/scenarios/simple_tag_non_adv_4.py'
).Scenario()

output = open('data_saq_test.pkl', 'wb')

# create world
world = scenario.make_world()
# create multiagent environment
env = MultiAgentEnv(world,
                    scenario.reset_world,
                    scenario.reward,
                    scenario.observation,
                    info_callback=None,
                    shared_viewer=True)
#
world.train_or_test = True
n_agents = env.n
# some initial training parameters
n_actions = world.dim_p
# the capacity of the experience memory
capacity = 1000000

batch_size = 1000
totalTime = 0
n_episode = 3000
max_steps = 100
# before training, we will store the experience of all agents' state information for the next training process.
Example No. 23
def main():

    # Experiment Configuration
    episodes = 5000
    steps_per_episode = 200
    output_dir = '../data/'

    # Load the simulation scenario
    scenario = scenarios.load("decentralized_safe.py").Scenario()
    world = scenario.make_world()

    # Environment Setup
    env = MultiAgentEnv(world,
                        scenario.reset_world,
                        scenario.reward,
                        scenario.observation,
                        info_callback=None,
                        constraint_callback=scenario.constraints,
                        shared_viewer=True)

    # The scenario parameters
    env_params = env.get_env_parameters()
    state_dim = env_params["state_dim"]
    action_dim = env_params["act_dim"]
    constraint_dim = env_params["constraint_dim"]
    num_agents = env_params["num_agents"]

    # Data Storage Containers
    size = episodes * (steps_per_episode - 1)
    state_buf = np.zeros([size, state_dim * num_agents])
    action_buf = np.zeros([size, action_dim * num_agents])
    constraint_diff = np.zeros([size, constraint_dim * num_agents])

    # Simulate the environment and generate dataset for constraints networks
    for episode in range(episodes):
        print(f'episode={episode}')

        # Episode "Preprocessing"
        state = env.reset()
        constraint_old = np.zeros([constraint_dim])

        for step in range(steps_per_episode):

            # Simulation
            action = np.random.uniform(-1, 1, action_dim * num_agents)
            action = np.split(action, num_agents)

            # Deep Copy the agent's action (otherwise it's altered in env.step())
            action_copy = copy.deepcopy(action)
            next_state, reward, _, _, constraint = env.step(action_copy)

            # Omit first simulation step
            if step == 0:
                constraint_old = constraint
                continue

            # Constraint diff
            diff = list(map(operator.sub, constraint, constraint_old))
            constraint_old = constraint

            # Store stuff to buffers for training
            idx = episode * (steps_per_episode - 1) + step - 1
            state_buf[idx, :] = np.concatenate(state)
            action_buf[idx, :] = np.concatenate(action)
            constraint_diff[idx, :] = np.concatenate(diff)

            # update state
            state = next_state

    # Export Results for training
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    pd.DataFrame(state_buf).to_csv(output_dir + "D_state_decentralized.csv")
    pd.DataFrame(action_buf).to_csv(output_dir + "D_action_decentralized.csv")
    pd.DataFrame(constraint_diff).to_csv(output_dir +
                                         "D_constraint_decentralized.csv")
    print("Done... Data saved")
Example No. 24
'''define the scenario name'''
scenario_name = 'simple_spread'

'''define the special property'''
# independentArgs = namedtuple( 'independentArgs', [] )
aux_args = AuxArgs[model_name]()
alias = '_new_6'

'''load scenario from script'''
scenario = scenarios.load(scenario_name + ".py").Scenario()

'''create world'''
world = scenario.make_world()

'''create multiagent environment'''
env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, info_callback=None, shared_viewer=True)
env = GymWrapper(env)

MergeArgs = namedtuple('MergeArgs', Args._fields+AuxArgs[model_name]._fields)

# under offline trainer if set batch_size=replay_buffer_size=update_freq -> epoch update
args = Args(model_name=model_name,
            agent_num=env.get_num_of_agents(),
            hid_size=32,
            obs_size=np.max(env.get_shape_of_obs()),
            continuous=False,
            action_dim=np.max(env.get_output_shape_of_act()),
            init_std=0.1,
            policy_lrate=1e-3,
            value_lrate=1e-2,
            max_steps=200,
Example No. 25
def train_function(config):

    # ----------- Alg parameters ----------------- #
    experiment = config['experiment']
    if experiment == "particle":
        scenario_name = config['scenario']
    seed = config['seed']
    np.random.seed(seed)
    random.seed(seed)

    # Curriculum stage
    stage = config['stage']
    port = config['port']
    dir_name = config['dir_name']
    dir_restore = config['dir_restore']
    use_alg_credit = config['use_alg_credit']
    use_qmix = config['use_qmix']
    use_Q_credit = config['use_Q_credit']
    # If 1, then uses Q-net and global reward
    use_Q = config['use_Q']
    use_V = config['use_V']
    if experiment == "sumo":
        dimensions = config['dimensions_sumo']
    elif experiment == "particle":
        dimensions = config['dimensions_particle']
    # If 1, then restores variables from same stage
    restore_same_stage = config['restore_same_stage']
    # If 1, then does not restore variables, even if stage > 1
    train_from_nothing = config['train_from_nothing']
    # Name of model to restore
    model_name = config['model_name']
    # Total number of training episodes
    N_train = config['N_train']
    period = config['period']
    # Number of evaluation episodes to run every <period>
    N_eval = config['N_eval']
    summarize = config['summarize']
    alpha = config['alpha']
    lr_Q = config['lr_Q']
    lr_V = config['lr_V']
    lr_actor = config['lr_actor']
    dual_buffer = config['dual_buffer']
    buffer_size = config['buffer_size']
    threshold = config['threshold']
    batch_size = config['batch_size']
    pretrain_episodes = config['pretrain_episodes']
    steps_per_train = config['steps_per_train']
    max_steps = config['max_steps']
    # Probability of using random configuration
    prob_random = config['prob_random']

    epsilon_start = config['epsilon_start']
    epsilon_end = config['epsilon_end']
    epsilon_div = config['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)

    if experiment == "sumo":
        # ----------- SUMO parameters ---------------- #
        with open('config_sumo_stage%d.json' % stage) as f:
            config_sumo = json.load(f)
        n_agents = config_sumo["n_agents"]
        list_goals_fixed = config_sumo['goal_lane']
        list_routes_fixed = config_sumo['route']
        list_lanes_fixed = config_sumo['lane']
        list_goal_pos = config_sumo['goal_pos']
        list_speeds = config_sumo['speed']
        init_positions = config_sumo['init_position']
        list_id = config_sumo['id']
        list_vtypes = config_sumo['vtypes']
        depart_mean = config_sumo['depart_mean']
        depart_stdev = config_sumo['depart_stdev']
        total_length = config_sumo['total_length']
        total_width = config_sumo['total_width']
        save_threshold = config_sumo['save_threshold']
        map_route_idx = {'route_ramp': 0, 'route_straight': 1}

        sim = sumo_simulator.Simulator(port,
                                       list_id=list_id,
                                       other_lc_mode=0b1000000001,
                                       sublane_res=0.8,
                                       seed=seed)
        for i in range(int(2 / sim.dt)):
            sim.step()
    elif experiment == 'particle':
        with open(config["particle_config"]) as f:
            config_particle = json.load(f)
        n_agents = config_particle['n_agents']
        scenario = scenarios.load(scenario_name + ".py").Scenario()
        world = scenario.make_world(n_agents, config_particle, prob_random)
        env = MultiAgentEnv(world,
                            scenario.reset_world,
                            scenario.reward,
                            scenario.observation,
                            None,
                            scenario.done,
                            max_steps=max_steps)
    elif experiment == 'checkers':
        with open("config_checkers_stage%d.json" % stage) as f:
            config_checkers = json.load(f)
        n_agents = config_checkers['n_agents']
        dimensions = config_checkers['dimensions']
        init = config_checkers['init']
        env = checkers.Checkers(init['n_rows'], init['n_columns'],
                                init['n_obs'], init['agents_r'],
                                init['agents_c'], n_agents, max_steps)

    l_action = dimensions['l_action']
    l_goal = dimensions['l_goal']

    # Create entire computational graph
    # Creation of new trainable variables for new curriculum
    # stage is handled by networks.py, given the stage number
    if use_alg_credit:
        if experiment == 'checkers':
            alg = alg_credit_checkers.Alg(experiment,
                                          dimensions,
                                          stage,
                                          n_agents,
                                          lr_V=lr_V,
                                          lr_Q=lr_Q,
                                          lr_actor=lr_actor,
                                          use_Q_credit=use_Q_credit,
                                          use_V=use_V,
                                          nn=config_checkers['nn'])
        else:
            alg = alg_credit.Alg(experiment,
                                 dimensions,
                                 stage,
                                 n_agents,
                                 lr_V=lr_V,
                                 lr_Q=lr_Q,
                                 lr_actor=lr_actor,
                                 use_Q_credit=use_Q_credit,
                                 use_V=use_V,
                                 nn=config['nn'])
    elif not use_qmix:
        if experiment == 'checkers':
            alg = alg_baseline_checkers.Alg(experiment,
                                            dimensions,
                                            stage,
                                            n_agents,
                                            lr_V=lr_V,
                                            lr_Q=lr_Q,
                                            lr_actor=lr_actor,
                                            use_Q=use_Q,
                                            use_V=use_V,
                                            alpha=alpha,
                                            nn=config_checkers['nn'],
                                            IAC=config['IAC'])
        else:
            alg = alg_baseline.Alg(experiment,
                                   dimensions,
                                   stage,
                                   n_agents,
                                   lr_V=lr_V,
                                   lr_Q=lr_Q,
                                   lr_actor=lr_actor,
                                   use_Q=use_Q,
                                   use_V=use_V,
                                   alpha=alpha,
                                   nn=config['nn'],
                                   IAC=config['IAC'])
    else:
        print("Using QMIX")
        if experiment == 'checkers':
            alg = alg_qmix_checkers.Alg(experiment,
                                        dimensions,
                                        stage,
                                        n_agents,
                                        lr_Q=lr_Q,
                                        nn=config_checkers['nn'])
        else:
            alg = alg_qmix.Alg(experiment,
                               dimensions,
                               stage,
                               n_agents,
                               lr_Q=lr_Q)

    print("Initialized computational graph")

    list_variables = tf.trainable_variables()
    if stage == 1 or restore_same_stage or train_from_nothing:
        saver = tf.train.Saver()
    elif stage == 2:
        # to_restore = [v for v in list_variables if ('stage-%d'%stage not in v.name.split('/') and 'Policy_target' not in v.name.split('/'))]
        to_restore = []
        for v in list_variables:
            list_split = v.name.split('/')
            if ('stage-%d' % stage not in list_split
                ) and ('Policy_target' not in list_split) and (
                    'Q_credit_main' not in list_split) and ('Q_credit_target'
                                                            not in list_split):
                to_restore.append(v)
        saver = tf.train.Saver(to_restore)
    else:
        # restore only those variables that were not
        # just created at this curriculum stage
        to_restore = [
            v for v in list_variables
            if 'stage-%d' % stage not in v.name.split('/')
        ]
        saver = tf.train.Saver(to_restore)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    tf.set_random_seed(seed)
    sess = tf.Session(config=config)

    writer = tf.summary.FileWriter('../saved/%s' % dir_name, sess.graph)

    sess.run(tf.global_variables_initializer())
    print("Initialized variables")

    if train_from_nothing == 0:
        print("Restoring variables from %s" % dir_restore)
        saver.restore(sess, '../saved/%s/%s' % (dir_restore, model_name))
        if stage == 2 and use_alg_credit and use_Q_credit:
            # Copy weights of Q_global to Q_credit at the start of Stage 2
            sess.run(alg.list_initialize_credit_ops)
            for var in list_variables:
                if var.name == 'Q_global_main/Q_branch1/kernel:0':
                    print("Q_global")
                    print(sess.run(var))
                    print("")
                if var.name == 'Q_credit_main/Q_branch1/kernel:0':
                    print("Q_credit")
                    print(sess.run(var))
                    print("")

    # initialize target networks to equal main networks
    sess.run(alg.list_initialize_target_ops)

    # save everything without exclusion
    saver = tf.train.Saver(max_to_keep=None)

    epsilon = epsilon_start
    # For computing average over 100 episodes
    reward_local_century = np.zeros(n_agents)
    reward_global_century = 0

    # Write log headers
    header = "Episode,r_global"
    header_c = "Century,r_global_avg"
    for idx in range(n_agents):
        header += ',r_%d' % idx
        header_c += ',r_avg_%d' % idx
    header_c += ",r_global_eval"
    for idx in range(n_agents):
        header_c += ',r_eval_%d' % idx

    if experiment == 'sumo':
        for idx in range(n_agents):
            header += ',route_%d,lane_%d,goal_%d' % (idx, idx, idx)
    header_c += ',r_eval_local,duration (s)'
    header += '\n'
    header_c += '\n'
    if not os.path.exists('../log/%s' % dir_name):
        os.makedirs('../log/%s' % dir_name)
    with open('../log/%s/log.csv' % dir_name, 'w') as f:
        f.write(header)
    with open('../log/%s/log_century.csv' % dir_name, 'w') as f:
        f.write(header_c)

    if dual_buffer:
        buf = replay_buffer_dual.Replay_Buffer(size=buffer_size)
    else:
        buf = replay_buffer.Replay_Buffer(size=buffer_size)

    t_start = time.time()

    dist_action = np.zeros(l_action)
    step = 0
    # Each iteration is a training episode
    for idx_episode in range(1, N_train + 1):
        # print("Episode", idx_episode)
        if experiment == "sumo":
            t_ms = sim.traci.simulation.getCurrentTime()
            # SUMO time functions return negative values after 24 days (in milliseconds) of simulation time
            # Hence use 0 for departure time, essentially triggering an immediate departure
            if 0 < t_ms and t_ms < 2073600e3:
                depart_times = [
                    np.random.normal(t_ms / 1000.0 + depart_mean[idx],
                                     depart_stdev) for idx in range(n_agents)
                ]
            else:
                depart_times = [0 for idx in range(n_agents)]

            # Goals for input to policy and value function
            goals = np.zeros([n_agents, l_goal])
            list_routes = ['route_straight'] * n_agents
            list_lanes = [0] * n_agents
            list_goal_lane = [0] * n_agents
            rand_num = random.random()
            if rand_num < prob_random:
                # Random settings for route, lane and goal
                init = 'Random'
                for idx in range(n_agents):
                    route = 'route_straight'
                    lane = np.random.choice([0, 1, 2, 3], p=np.ones(4) * 0.25)
                    goal_lane = np.random.choice(np.arange(l_goal),
                                                 p=np.ones(l_goal) /
                                                 float(l_goal))
                    list_routes[idx] = route
                    list_lanes[idx] = lane
                    list_goal_lane[idx] = goal_lane
                    goals[idx, goal_lane] = 1
            else:
                init = 'Preset'
                # Use predetermined values for route, lane, goal
                for idx in range(n_agents):
                    list_routes[idx] = list_routes_fixed[idx]
                    goal_lane = list_goals_fixed[idx]
                    list_goal_lane[idx] = goal_lane
                    list_lanes[idx] = list_lanes_fixed[idx]
                    goals[idx, goal_lane] = 1

            env = multicar_simple.Multicar(sim,
                                           n_agents,
                                           list_goal_lane,
                                           list_goal_pos,
                                           list_routes,
                                           list_speeds,
                                           list_lanes,
                                           init_positions,
                                           list_id,
                                           list_vtypes,
                                           depart_times,
                                           total_length=total_length,
                                           total_width=total_width,
                                           safety=True)
            global_state, local_others, local_self, done = env.reset()
        elif experiment == "particle":
            global_state, local_others, local_self, done = env.reset()
            goals = np.zeros([n_agents, l_goal])
            for idx in range(n_agents):
                goals[idx] = env.world.landmarks[idx].state.p_pos
        elif experiment == "checkers":
            if n_agents == 1:
                if np.random.randint(2) == 0:
                    goals = np.array([[1, 0]])
                else:
                    goals = np.array([[0, 1]])
            else:
                goals = np.eye(n_agents)
            global_state, local_others, local_self_t, local_self_v, done = env.reset(
                goals)
            actions_prev = np.zeros(n_agents, dtype=np.int)

        reward_global = 0
        reward_local = np.zeros(n_agents)

        # step = 0
        summarized = False
        if dual_buffer:
            buf_episode = []
        while not done:

            if idx_episode < pretrain_episodes and (stage == 1 or
                                                    train_from_nothing == 1):
                # Random actions when filling replay buffer
                actions = np.random.randint(0, l_action, n_agents)
            else:
                # Run actor network for all agents as batch
                if experiment == 'checkers':
                    actions = alg.run_actor(actions_prev, local_others,
                                            local_self_t, local_self_v, goals,
                                            epsilon, sess)
                else:
                    actions = alg.run_actor(local_others, local_self, goals,
                                            epsilon, sess)

            dist_action[actions[0]] += 1
            if experiment == 'sumo':
                # check feasible actions
                actions = env.check_actions(actions)

            # step environment
            if experiment == 'checkers':
                next_global_state, next_local_others, next_local_self_t, next_local_self_v, reward, local_rewards, done = env.step(
                    actions)
            else:
                next_global_state, next_local_others, next_local_self, reward, local_rewards, done = env.step(
                    actions)

            step += 1

            # store transition into memory
            if dual_buffer:
                if experiment == 'checkers':
                    buf_episode.append(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf_episode.append(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))
            else:
                if experiment == 'checkers':
                    buf.add(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf.add(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train
                                                       == 0):
                # Sample batch of transitions from replay buffer
                batch = buf.sample_batch(batch_size)

                if summarize and idx_episode % period == 0 and not summarized:
                    # Write TF summary every <period> episodes,
                    # at the first <steps_per_train> step
                    alg.train_step(sess,
                                   batch,
                                   epsilon,
                                   idx_episode,
                                   summarize=True,
                                   writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess,
                                   batch,
                                   epsilon,
                                   idx_episode,
                                   summarize=False,
                                   writer=None)

            global_state = next_global_state
            local_others = next_local_others
            if experiment == 'checkers':
                local_self_t = next_local_self_t
                local_self_v = next_local_self_v
                actions_prev = actions
            else:
                local_self = next_local_self

            reward_local += local_rewards
            reward_global += reward

        if dual_buffer:
            if experiment == 'sumo':
                buf.add(buf_episode, np.sum(reward_local) < threshold)
            elif experiment == 'particle':
                buf.add(buf_episode, scenario.collisions != 0)

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_local_century += reward_local
        reward_global_century += reward_global

        # ----------- Log performance --------------- #

        if idx_episode % period == 0:
            dist_action = dist_action / np.sum(dist_action)
            t_end = time.time()
            print("\n Evaluating")
            if experiment == 'sumo':
                r_local_eval, r_global_eval = evaluate.test(
                    N_eval, sim, sess, depart_mean, depart_stdev, n_agents,
                    l_goal, list_routes_fixed, list_lanes_fixed,
                    list_goals_fixed, prob_random, list_goal_pos, list_speeds,
                    init_positions, list_id, list_vtypes, alg)
                if np.all(r_local_eval > save_threshold):
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            elif experiment == 'particle':
                r_local_eval, r_global_eval = evaluate.test_particle(
                    N_eval, env, sess, n_agents, l_goal, alg, render=False)
            elif experiment == 'checkers':
                r_local_eval, r_global_eval = evaluate.test_checkers(
                    N_eval, env, sess, n_agents, alg)
                if stage == 1 and np.sum(r_local_eval) > 9.0:
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            s = '%d,%.2f,' % (idx_episode,
                              reward_global_century / float(period))
            s += ','.join([
                '{:.2f}'.format(val / float(period))
                for val in reward_local_century
            ])
            s += ',%.2f,' % (r_global_eval)
            s += ','.join(['{:.2f}'.format(val) for val in r_local_eval])
            s += ',%.2f,%d' % (np.sum(r_local_eval), int(t_end - t_start))
            s += '\n'
            print(s)
            with open('../log/%s/log_century.csv' % dir_name, 'a') as f:
                f.write(s)
            reward_local_century = np.zeros(n_agents)
            reward_global_century = 0
            print("Action distribution ", dist_action)
            if dual_buffer:
                print(
                    "length buffer good %d, length buffer others %d, epsilon %.3f"
                    % (len(buf.memory_2), len(buf.memory_1), epsilon))
            else:
                print("epsilon %.3f" % epsilon)
            dist_action = np.zeros(l_action)

            t_start = time.time()

        s = '%d,%.2f,' % (idx_episode, reward_global)
        s += ','.join(['{:.2f}'.format(val) for val in reward_local])
        if experiment == 'sumo':
            for idx in range(n_agents):
                s += ',%d,%d,%d' % (map_route_idx[list_routes[idx]],
                                    list_lanes[idx], list_goal_lane[idx])
        s += '\n'
        with open('../log/%s/log.csv' % dir_name, 'a') as f:
            f.write(s)

    print("Saving stage %d variables" % stage)
    if not os.path.exists('../saved/%s' % dir_name):
        os.makedirs('../saved/%s' % dir_name)
    saver.save(sess, '../saved/%s/model_final.ckpt' % dir_name)
Example No. 26
def make_env(scenario_name,
             local_observation=True,
             benchmark=False,
             discrete_action=True):
    '''
    Creates a MultiAgentEnv object as env. This can be used similar to a gym
    environment by calling env.reset() and env.step().
    Use env.render() to view the environment on the screen.

    Input:
        scenario_name   :   name of the scenario from ./scenarios/ to be loaded
                            (without the .py extension)
        benchmark       :   whether you want to produce benchmarking data
                            (usually only done during evaluation)

    Some useful env properties (see environment.py):
        .observation_space  :   Returns the observation space for each agent
        .action_space       :   Returns the action space for each agent
        .n                  :   Returns the number of Agents
    
    simple_spread
    simple_reference
    simple_speaker_listener
    collect_treasure
    multi_speaker_listener

    '''
    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    if local_observation:
        if scenario_name == 'simple_spread':
            scenario.observation = local_obs_simple_spread.__get__(scenario)
        elif scenario_name == 'simple_reference':
            scenario.observation = local_obs_simple_reference.__get__(scenario)
        elif scenario_name == 'simple_speaker_listener':
            scenario.observation = local_obs_simple_speaker_listener.__get__(
                scenario)
        elif scenario_name == 'multi_speaker_listener':
            # scenario.observation = local_obs_multi_speaker_listener.__get__(scenario)
            print('origin')
        elif scenario_name == 'fullobs_collect_treasure':
            scenario.observation = local_obs_collect_treasure.__get__(scenario)
        else:
            print('error: unsupported scenario!')

    # create world
    world = scenario.make_world()
    world.collaborative = False  # to get individual reward

    # create multiagent environment
    if hasattr(scenario, 'post_step'):
        post_step = scenario.post_step
    else:
        post_step = None
    if benchmark:
        env = MultiAgentEnv(world,
                            reset_callback=scenario.reset_world,
                            reward_callback=scenario.reward,
                            observation_callback=scenario.observation,
                            post_step_callback=post_step,
                            info_callback=scenario.benchmark_data,
                            discrete_action=discrete_action)
    else:
        env = MultiAgentEnv(world,
                            reset_callback=scenario.reset_world,
                            reward_callback=scenario.reward,
                            observation_callback=scenario.observation,
                            post_step_callback=post_step,
                            discrete_action=discrete_action)
    env.force_discrete_action = True
    return env
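A short sketch (assumed, not from the original) of inspecting the env properties that the docstring lists on an environment built by this helper:

env = make_env("simple_spread")
print("number of agents:", env.n)
print("observation spaces:", env.observation_space)   # one space per agent
print("action spaces:", env.action_space)             # one space per agent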
Example No. 27
            AgentSpec(type=0), AgentSpec(type=1), AgentSpec(type=2),
            AgentSpec(type=0), AgentSpec(type=1), AgentSpec(type=2),
            AgentSpec(type=0), AgentSpec(type=1), AgentSpec(type=2)
        ],
        reward_type=reward_type,
        reward_type_string=visualization._type_string,
        shuffle_on_reset=True
    )

    scenario = Scenario()
    world = scenario.make_world(env_spec)
    env = MultiAgentEnv(
        world,
        env_spec,
        scenario.reset_world,
        scenario.reward,
        scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        shared_viewer=True
    )
    # env.viewers[0].cam_range = env_spec.zoom

    """
    # interactive policy test
    policies = [InteractivePolicy(env, i) for i in range(env.n)]
    obs_n = env.reset()
    world.agents[0].state.p_pos = [0., 0.]
    while True:
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
Example No. 28
import pdb

if __name__ == '__main__':
    import multiagent.scenarios as scenarios
    scenario = scenarios.load(
        "multiagent-particle-envs/multiagent/scenarios/simple_tag.py"
    ).Scenario()
    world = scenario.make_world()
    from multiagent.environment import MultiAgentEnv
    env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                        scenario.observation)

    n_agents = env.n
    dim_act = world.dim_p * 2 + 1
    obs = env.reset()
    n_states = len(obs[0])

    n_episode = 20000
    max_steps = 1200
    from maddpg import *
    maddpg = MADDPG(n_agents, n_states, dim_act)

    for i_episode in range(n_episode):
        obs = env.reset()
        obs = np.stack(obs)
        max_steps = 1200
        total_reward = 0
        adversaries_reward = 0
        goodagent_reward = 0
        for t in range(max_steps):
            actions = maddpg.produce_action(obs)
Example No. 29
    # parse arguments
    parser = argparse.ArgumentParser(description=None)
    parser.add_argument('-s',
                        '--scenario',
                        default='simple.py',
                        help='Path of the scenario Python script.')
    args = parser.parse_args()

    # load scenario from script
    scenario = scenarios.load(args.scenario).Scenario()
    # create world
    world = scenario.make_world()  # world is a class of world
    # create multiagent environment
    env = MultiAgentEnv(world,
                        scenario.reset_world,
                        scenario.reward,
                        scenario.observation,
                        info_callback=None,
                        shared_viewer=False)
    # render call to create viewer window (necessary only for interactive policies)
    env.render()
    # create interactive policies for each agent
    policies = [InteractivePolicy(env, i) for i in range(env.n)]  # one policy per agent in the simulation world
    # execution loop
    obs_n = env.reset()
    while True:
        # query for action from each agent's policy
        act_n = []
        for i, policy in enumerate(policies):
            act_n.append(policy.action(obs_n[i]))
        # step environment
Example No. 30
        dropout = tf.layers.dropout(
            inputs=dense, rate=0.4, training=True)
        context = tf.layers.dense(inputs=dropout, units=num_outputs)
        return context

def make_env(scenario_name, arglist, benchmark=False):
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    # load scenario from script
    scenario = scenarios.load(scenario_name + ".py").Scenario()
    # create world
    world = scenario.make_world()
    # create multiagent environment
    if benchmark:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation, scenario.benchmark_data)
    else:
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward, scenario.observation)
    return env

def get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist):
    trainers = []
    model = mlp_model
    map_model = CNN_model
    trainer = MADDPGAgentTrainer
    for i in range(num_adversaries):
        trainers.append(trainer(
            "agent_%d" % i, model, map_model, obs_shape_n, obs_map_shape_n, env.action_space, i, arglist,
            local_q_func=(arglist.adv_policy == 'ddpg')))
    for i in range(num_adversaries, env.n):
        trainers.append(trainer(
Example No. 31
def train(scenario):
    path_to_save = 'models/' + scenario.__module__.split('.')[-1] + '/ddpg'
    train_n = 1
    if not os.path.exists(path_to_save):
        os.makedirs(path_to_save)
    world = scenario.make_world()
    env = MultiAgentEnv(
        world,
        reset_callback=scenario.reset_world,
        reward_callback=scenario.reward,
        observation_callback=scenario.observation,
        info_callback=None,
        done_callback=scenario.done,
        collision_callback=scenario.is_collision,
        shared_viewer=True,
    )
    evaluator = evaluate_models.Evaluator(args,
                                          scenario,
                                          save=scenario.name + '/' +
                                          str(train_n))

    with U.single_threaded_session() as sess:
        simple_agents = [StayAgent(env, 1)]  # good agent
        agents_with_nn = [
            DDPGAgent(
                env,
                0,
                sess,
                batch_size=args.batch_size,
                memory_size=args.memory_size,
                noise_type=args.noise_type,
                # good agent
                actor_lr=args.actor_lr,
                critic_lr=args.critic_lr,
                layer_norm=True,
                nb_layers=args.nb_layers,
                nb_neurons=args.nb_neurons)
        ]
        policies = [agents_with_nn[0], simple_agents[0]]
        print('agents are created')

        # for agent in agents_with_nn:
        #     agent.agent.initialize(sess)

        saver = tf.train.Saver()
        if args.load_weights:
            saver.restore(sess, 'models/' + scenario.name + '/ddpg/model')
        sess.graph.finalize()
        # for agent in agents_with_nn:
        #     agent.agent.reset()
        statistics_header = ["episode"]
        statistics_header.append("steps")
        statistics_header.extend(["reward_{}".format(i) for i in range(env.n)])
        statistics_header.extend(["q_{}".format(i) for i in range(env.n)])
        statistics = utilities.Time_Series_Statistics_Store(statistics_header)

        for episode in range(args.episodes):
            if episode % 500 == 0:
                print('episode ' + str(episode))
            # reset
            for agent in policies:
                agent.reset()
            states = env.reset()

            step = 0
            while True:
                episode_q = np.zeros(env.n)
                episode_rewards = np.zeros(env.n)
                step += 1
                env_done = False
                # choose actions
                if args.render:
                    env.render()
                actions = [None for _ in range(len(world.policy_agents))]
                for agent in simple_agents:
                    actions[agent.agent_index] = (agent.action(
                        states[agent.agent_index]))
                    episode_q[0] += 0
                for agent in agents_with_nn:
                    action, q = agent.action(states[agent.agent_index],
                                             apply_noise=True,
                                             compute_Q=True)
                    actions[agent.agent_index] = action
                    episode_q[agent.agent_index] += q

                # step
                states_next, rewards, done, info = env.step(actions)
                episode_rewards += rewards

                # save to memory
                # print(rewards)
                for agent in agents_with_nn:
                    agent.agent.store_transition(
                        states[agent.agent_index], actions[agent.agent_index],
                        rewards[agent.agent_index],
                        states_next[agent.agent_index],
                        done[agent.agent_index])

                if step >= args.max_steps:
                    env_done = True
                for agent in agents_with_nn:
                    if done[agent.agent_index]:
                        env_done = True

                states = states_next
                if env_done:
                    episode_rewards = episode_rewards / step
                    episode_losses = episode_q / step
                    statistic = [episode]
                    statistic.append(step)
                    statistic.extend(
                        [episode_rewards[i] for i in range(env.n)])
                    statistic.extend([episode_q[i] for i in range(env.n)])
                    statistics.add_statistics(statistic)
                    break

            # learn
            # Adapt param noise, if necessary.
            for t_train in range(args.nb_train_steps):
                for agent in agents_with_nn:
                    if agent.agent.memory.nb_entries >= args.batch_size:
                        if episode % args.param_noise_adaption_interval == 0:
                            distance = agent.agent.adapt_param_noise()
                        # print('train')
                        cl, al = agent.agent.train()
                        agent.agent.update_target_net()

            if episode % args.save_every_n_episodes == 0:
                saver.save(
                    sess, 'models/' + scenario.__module__.split('.')[-1] +
                    '/ddpg/model')

            if args.evaluate_every_n_episodes != 0 and episode % args.evaluate_every_n_episodes == 0:
                statistics.dump("{}_{}.csv".format(
                    args.experiment_prefix +
                    scenario.__module__.split('.')[-1], episode))
                evaluator.evaluate(env, policies, episode)

        saver.save(
            sess,
            'models/' + scenario.__module__.split('.')[-1] + '/ddpg/model')