Example #1
def run():
    # Create environment
    env, state_dim, action_dim, max_steps = make_env(env_params=Namespace(
        **env_params))
    env_eval, _, _, _ = make_env(env_params=Namespace(**env_params))
    # Create agent trainers
    # obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
    # num_adversaries = min(env.n, arglist.num_adversaries)

    obs_shape_n = state_dim
    act_shape_n = action_dim

    maa2c = MAA2C(env,
                  env_params['n_agents'],
                  obs_shape_n,
                  act_shape_n,
                  max_steps=max_steps)

    episodes = []
    eval_rewards = []
    while maa2c.n_episodes < MAX_EPISODES:
        # print(maa2c.env_state)
        maa2c.interact()
        # if maa2c.n_episodes >= EPISODES_BEFORE_TRAIN:
        #     maa2c.train()
        maa2c.train()
        if maa2c.episode_done and (maa2c.n_episodes % EVAL_INTERVAL == 0):
            rewards, _ = maa2c.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f, STD %.2f" %
                  (maa2c.n_episodes, rewards_mu, rewards_std))
            episodes.append(maa2c.n_episodes)
            eval_rewards.append(rewards_mu)
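All of the examples here aggregate evaluation results with agg_double_list, whose definition is not shown. A minimal sketch of such a helper, assuming it receives one inner list of rewards per evaluation episode and reports the mean and standard deviation of the per-episode sums:

import numpy as np

def agg_double_list(double_list):
    # Hypothetical helper: sum the rewards of each inner list to get one
    # return per episode, then aggregate across episodes.
    episode_returns = [np.sum(np.array(rewards), axis=0) for rewards in double_list]
    return np.mean(episode_returns), np.std(episode_returns)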
Example #2
def run(env_id="AttFC_GyroErr-MotorVel_M4_Con-v0"):
    env = gym.make(env_id)
    env = RewScale(env, 0.1)
    env.seed(RANDOM_SEED)
    env_eval = gym.make(env_id)
    env_eval = RewScale(env_eval, 0.1)
    env_eval.seed(RANDOM_SEED)
    state_dim = env.observation_space.shape[0]

    # Continuous action spaces expose their dimension through shape; discrete
    # ones expose the number of actions through n.
    if len(env.action_space.shape) > 0:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    ppo = PPO(env=env,
              memory_capacity=MEMORY_CAPACITY,
              state_dim=state_dim,
              action_dim=action_dim,
              batch_size=BATCH_SIZE,
              entropy_reg=ENTROPY_REG,
              done_penalty=DONE_PENALTY,
              roll_out_n_steps=ROLL_OUT_N_STEPS,
              target_update_steps=TARGET_UPDATE_STEPS,
              target_tau=TARGET_TAU,
              reward_gamma=REWARD_DISCOUNTED_GAMMA,
              epsilon_start=EPSILON_START,
              epsilon_end=EPSILON_END,
              epsilon_decay=EPSILON_DECAY,
              max_grad_norm=MAX_GRAD_NORM,
              episodes_before_train=EPISODES_BEFORE_TRAIN,
              critic_loss=CRITIC_LOSS)

    episodes = []
    eval_rewards = []
    while ppo.n_episodes < MAX_EPISODES:
        ppo.interact()
        if ppo.n_episodes >= EPISODES_BEFORE_TRAIN:
            ppo.train()
        if ppo.episode_done and ((ppo.n_episodes + 1) % EVAL_INTERVAL == 0):
            rewards, _ = ppo.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f" %
                  (ppo.n_episodes + 1, rewards_mu))
            episodes.append(ppo.n_episodes + 1)
            eval_rewards.append(rewards_mu)

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)
    np.savetxt("./output/%s_ppo_episodes.txt" % env_id, episodes)
    np.savetxt("./output/%s_ppo_eval_rewards.txt" % env_id, eval_rewards)

    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.title("%s" % env_id)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["PPO"])
    plt.savefig("./output/%s_ppo.png" % env_id)
Example #3
    def test_final(self, actor_weight_file, critic_weight_file):
        # Evaluate the performance of the agent over 100 episodes by
        # calculating the cumulative reward of each episode. Interaction with
        # the environment happens here, irrespective of whether a replay
        # memory is used.

        # Load the weights
        self.a2c.load_weights(actor_weight_file, critic_weight_file)

        # Set the environment configuration
        if self.environment_name == 'cp-v0':
            self.env.env.my_init(self.G * 10.5, self.MC * 0.9, self.MP * 2.5,
                                 self.L * 1.5, self.F * 2.5)
        elif self.environment_name == 'Bipedal-v0':
            self.env.env.my_init(self.F * 2.5)

        # mini train
        num_minitrain_episodes = 10
        while self.a2c.n_episodes < num_minitrain_episodes:
            self.a2c.interact()
            self.a2c.train()

        episodes = []
        eval_rewards = []
        num_episodes = 40 + num_minitrain_episodes

        while self.a2c.n_episodes < num_episodes:
            self.a2c.interact()
            rewards, _ = self.a2c.evaluation(self.env_eval, 1)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f" %
                  (self.a2c.n_episodes + 1, rewards_mu))
            episodes.append(self.a2c.n_episodes + 1)
            eval_rewards.append(rewards_mu)

        episodes = np.array(episodes)
        eval_rewards = np.array(eval_rewards)

        # Print mean and std.dev
        mean_reward = np.mean(eval_rewards)
        stddev_reward = np.std(eval_rewards)
        print("Mean Reward:{}\n Std. dev:{}".format(mean_reward,
                                                    stddev_reward))

        # Save the plot
        base_path = os.path.join(self.environment_name, 'a2c_plot_test')
        if not os.path.exists(base_path):
            os.makedirs(base_path)
        file_name = os.path.join(base_path, 'Average_reward.png')

        plt.figure()
        plt.plot(episodes, eval_rewards)
        plt.title("%s" % self.environment_name)
        plt.xlabel("Episode")
        plt.ylabel("Average Reward")
        plt.legend(["A2C"])
        # Save before plt.show(), which may clear the current figure.
        plt.savefig(file_name)
        plt.show()
Example #4
def run(env_id="CartPole-v0"):

    env = gym.make(env_id)
    env.seed(RANDOM_SEED)
    env_eval = gym.make(env_id)
    env_eval.seed(RANDOM_SEED)
    state_dim = env.observation_space.shape[0]
    if len(env.action_space.shape) > 1:
        action_dim = env.action_space.shape[0]
    else:
        action_dim = env.action_space.n

    dqn = DQN(env=env,
              memory_capacity=MEMORY_CAPACITY,
              state_dim=state_dim,
              action_dim=action_dim,
              batch_size=BATCH_SIZE,
              max_steps=MAX_STEPS,
              done_penalty=DONE_PENALTY,
              critic_loss=CRITIC_LOSS,
              reward_gamma=REWARD_DISCOUNTED_GAMMA,
              epsilon_start=EPSILON_START,
              epsilon_end=EPSILON_END,
              epsilon_decay=EPSILON_DECAY,
              max_grad_norm=MAX_GRAD_NORM,
              episodes_before_train=EPISODES_BEFORE_TRAIN)

    episodes = []
    eval_rewards = []
    while dqn.n_episodes < MAX_EPISODES:
        dqn.interact()
        if dqn.n_episodes >= EPISODES_BEFORE_TRAIN:
            dqn.train()
        if dqn.episode_done and ((dqn.n_episodes + 1) % EVAL_INTERVAL == 0):
            rewards, _ = dqn.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f" %
                  (dqn.n_episodes + 1, rewards_mu))
            episodes.append(dqn.n_episodes + 1)
            eval_rewards.append(rewards_mu)

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)
    np.savetxt("./output/%s_dqn_episodes.txt" % env_id, episodes)
    np.savetxt("./output/%s_dqn_eval_rewards.txt" % env_id, eval_rewards)

    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.title("%s" % env_id)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["DQN"])
    plt.savefig("./output/%s_dqn.png" % env_id)
Example #5
def run(env_id="Pendulum-v0"):

    env = gym.make(env_id)
    env.seed(RANDOM_SEED)
    env_eval = gym.make(env_id)
    env_eval.seed(RANDOM_SEED)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    ddpg = DDPG(env=env,
                memory_capacity=MEMORY_CAPACITY,
                state_dim=state_dim,
                action_dim=action_dim,
                batch_size=BATCH_SIZE,
                max_steps=MAX_STEPS,
                done_penalty=DONE_PENALTY,
                target_update_steps=TARGET_UPDATE_STEPS,
                target_tau=TARGET_TAU,
                reward_gamma=REWARD_DISCOUNTED_GAMMA,
                critic_loss=CRITIC_LOSS,
                epsilon_start=EPSILON_START,
                epsilon_end=EPSILON_END,
                epsilon_decay=EPSILON_DECAY,
                max_grad_norm=MAX_GRAD_NORM,
                episodes_before_train=EPISODES_BEFORE_TRAIN)

    episodes = []
    eval_rewards = []
    while ddpg.n_episodes < MAX_EPISODES:
        ddpg.interact()
        if ddpg.n_episodes >= EPISODES_BEFORE_TRAIN:
            ddpg.train()
        if ddpg.episode_done and ((ddpg.n_episodes + 1) % EVAL_INTERVAL == 0):
            rewards, _ = ddpg.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode: %d, Average Reward: %.5f" %
                  (ddpg.n_episodes + 1, rewards_mu))
            episodes.append(ddpg.n_episodes + 1)
            eval_rewards.append(rewards_mu)

    episodes = np.array(episodes)
    eval_rewards = np.array(eval_rewards)
    np.savetxt("./output/%s_ddpg_episodes.txt" % env_id, episodes)
    np.savetxt("./output/%s_ddpg_eval_rewards.txt" % env_id, eval_rewards)

    plt.figure()
    plt.plot(episodes, eval_rewards)
    plt.xlabel("Episode")
    plt.ylabel("Average Reward")
    plt.legend(["DDPG"])
    plt.savefig("./output/%s_ddpg.png" % env_id)
Example #6
    def train(self, render=1):
        # Train the network. Without an experience replay memory, interaction
        # with the environment and the parameter updates both happen in this
        # function. With a replay memory, interact with the environment here,
        # store the transitions in memory, and update the model from sampled
        # batches.

        # Variables init

        # Burn in memory

        episodes = []
        eval_rewards = []

        while self.a2c.n_episodes < self.max_episodes:
            self.a2c.interact()
            if self.a2c.n_episodes >= self.episodes_before_train:
                self.a2c.train()
            if self.a2c.episode_done and ((self.a2c.n_episodes + 1) %
                                          self.eval_iterval == 0):
                rewards, _ = self.a2c.evaluation(self.env_eval,
                                                 self.eval_episodes)
                rewards_mu, rewards_std = agg_double_list(rewards)
                print("Episode %d, Average Reward %.2f" %
                      (self.a2c.n_episodes + 1, rewards_mu))
                episodes.append(self.a2c.n_episodes + 1)
                eval_rewards.append(rewards_mu)

                # Save the weights
                print("=> Saving weights after {} episodes".format(
                    self.a2c.n_episodes + 1))
                self.a2c.save_weights(self.environment_name,
                                      self.a2c.n_episodes + 1)

        episodes = np.array(episodes)
        eval_rewards = np.array(eval_rewards)

        # Save the plot
        base_path = os.path.join(self.environment_name, 'a2c_plot_eval')
        if not os.path.exists(base_path):
            os.makedirs(base_path)
        file_name = os.path.join(base_path, 'Average_reward.png')

        plt.figure()
        plt.plot(episodes, eval_rewards)
        plt.title("%s" % self.environment_name)
        plt.xlabel("Episode")
        plt.ylabel("Average Reward")
        plt.legend(["A2C"])
        plt.savefig(file_name)
Example #7
def run(arglist):
    # Create environment
    env = make_env(arglist.scenario, arglist, arglist.benchmark)
    env_eval = make_env(arglist.scenario, arglist, arglist.benchmark)
    # Create agent trainers
    # obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
    num_adversaries = min(env.n, arglist.num_adversaries)

    obs_shape_n = [env.observation_space[i].shape[0] for i in range(env.n)]
    act_shape_n = [env.action_space[i].n for i in range(env.n)]

    maa2c = MAA2C(env, env.n, obs_shape_n, act_shape_n)
    # maa2c_eval = MAA2C(env_eval, env_eval.n, obs_shape_n, act_shape_n)
    # trainers = get_trainers(env)

    # env = gym.make(env_id)
    # env.seed(RANDOM_SEED)
    # env_eval = gym.make(env_id)
    # env_eval.seed(RANDOM_SEED)
    # state_dim = env.observation_space.shape[0]
    # if len(env.action_space.shape) > 1:
    #     action_dim = env.action_space.shape[0]
    # else:
    #     action_dim = env.action_space.n

    # a2c = A2C(env=env, memory_capacity=MEMORY_CAPACITY,
    #           state_dim=state_dim, action_dim=action_dim,
    #           batch_size=BATCH_SIZE, entropy_reg=ENTROPY_REG,
    #           done_penalty=DONE_PENALTY, roll_out_n_steps=ROLL_OUT_N_STEPS,
    #           reward_gamma=REWARD_DISCOUNTED_GAMMA,
    #           epsilon_start=EPSILON_START, epsilon_end=EPSILON_END,
    #           epsilon_decay=EPSILON_DECAY, max_grad_norm=MAX_GRAD_NORM,
    #           episodes_before_train=EPISODES_BEFORE_TRAIN,
    #           critic_loss=CRITIC_LOSS)

    episodes = []
    eval_rewards = []
    while maa2c.n_episodes < MAX_EPISODES:
        maa2c.interact()
        # if maa2c.n_episodes >= EPISODES_BEFORE_TRAIN:
        #     maa2c.train()
        maa2c.train()
        if maa2c.episode_done and (maa2c.n_episodes % EVAL_INTERVAL == 0):
            rewards, _ = maa2c.evaluation(env_eval, EVAL_EPISODES)
            rewards_mu, rewards_std = agg_double_list(rewards)
            # print(rewards)
            print("Episode %d, Average Reward %.2f, STD %.2f" %
                  (maa2c.n_episodes, rewards_mu, rewards_std))
            episodes.append(maa2c.n_episodes)
            eval_rewards.append(rewards_mu)
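make_env(arglist.scenario, arglist, arglist.benchmark) is assumed to construct a multi-agent particle environment in the style of the MADDPG reference code; a sketch under that assumption:

def make_env(scenario_name, arglist, benchmark=False):
    # Assumes the multiagent-particle-envs package is available.
    from multiagent.environment import MultiAgentEnv
    import multiagent.scenarios as scenarios

    scenario = scenarios.load(scenario_name + ".py").Scenario()
    world = scenario.make_world()
    if benchmark:
        return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                             scenario.observation, scenario.benchmark_data)
    return MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                         scenario.observation)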
Example #8
    def train_maml(self, render=1):

        sample_size = 10
        theta_list = []
        K = 1
        num_iterations = 50000
        task_list = []

        plt.figure()

        for i in range(num_iterations * sample_size):
            if (self.environment_name == 'cp-v0'):
                task = {
                    'G':
                    np.random.uniform(self.range[0] * self.G,
                                      self.range[1] * self.G, 1)[0],
                    'MC':
                    np.random.uniform(self.range[0] * self.MC,
                                      self.range[1] * self.MC, 1)[0],
                    'MP':
                    np.random.uniform(self.range[0] * self.MP,
                                      self.range[1] * self.MP, 1)[0],
                    'L':
                    np.random.uniform(self.range[0] * self.L,
                                      self.range[1] * self.L, 1)[0],
                    'F':
                    np.random.uniform(self.range[0] * self.F,
                                      self.range[1] * self.F, 1)[0]
                }
            elif (self.environment_name == 'Bipedal-v0'):
                task = {
                    'F':
                    np.random.uniform(self.range[0] * self.F,
                                      self.range[1] * self.F, 1)[0]
                }

            task_list.append(task)

        num_tasks = len(task_list)

        # Outer loop
        for i in range(num_iterations):
            sample_indexes = np.random.randint(0, num_tasks, size=sample_size)

            # Get the theta
            if i == 0:
                theta_actor_critic = self.a2c.get_weights()

            # Inner loop
            # First gradient
            for j, sample_index in enumerate(sample_indexes):
                task = task_list[sample_index]
                # Set the configuration
                if (self.environment_name == 'cp-v0'):
                    self.env.env.my_init(task['G'], task['MC'], task['MP'],
                                         task['L'], task['F'])
                elif (self.environment_name == 'Bipedal-v0'):
                    self.env.env.my_init(task['F'])

                # Set the model weights to theta before training
                self.a2c.set_weights(theta_actor_critic)

                # Train the a2c network for this task for K episodes
                while self.a2c.n_episodes < K:
                    self.a2c.interact()
                    self.a2c.train()

                if i == 0:
                    theta_list.append(self.a2c.get_weights())
                else:
                    theta_list[j] = self.a2c.get_weights()

            # Second gradient (meta-update)
            for j, sample_index in enumerate(sample_indexes):
                task = task_list[sample_index]
                # Set the configuration
                if (self.environment_name == 'cp-v0'):
                    self.env.env.my_init(task['G'], task['MC'], task['MP'],
                                         task['L'], task['F'])
                elif (self.environment_name == 'Bipedal-v0'):
                    self.env.env.my_init(task['F'])

                # Set the model weights to theta before training
                self.a2c.set_weights(theta_list[j])

                # Get the network loss for this task for 1 episode
                # TODO: There should be no while loop
                # while self.a2c.n_episodes < 1:
                self.a2c.interact()
                combined_loss = self.a2c.get_loss()

                # Set the model weights to theta
                self.a2c.set_weights(theta_actor_critic)

                # Update theta
                self.a2c.update_net(combined_loss)
                theta_actor_critic = self.a2c.get_weights()

            # Evaluate the network
            self.a2c.interact()
            rewards, _ = self.a2c.evaluation(self.env_eval, 1)
            rewards_mu, rewards_std = agg_double_list(rewards)
            print("Episode %d, Average Reward %.2f" % (i + 1, rewards_mu))

            # Plot iteration vs reward
            plt.scatter(i, rewards_mu)
            #plt.pause(0.0001)

            # Save the weights
            if i % self.save_weight_interval == 0 and i != 0:
                self.a2c.save_weights(self.environment_name, i)
            base_path = os.path.join(self.environment_name, 'a2c_plot_train')
            if not os.path.exists(base_path):
                os.makedirs(base_path)
            file_name = os.path.join(base_path, 'Average_reward_train.png')
            plt.title("%s" % self.environment_name)
            plt.xlabel("Episode")
            plt.ylabel("Average Reward")
            plt.legend(["A2C"])
            plt.savefig(file_name)
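The loops above follow the MAML pattern: adapt a copy of the shared weights on each sampled task, then update the shared weights from the post-adaptation losses. A compressed sketch of one meta-iteration, reusing the get_weights/set_weights/get_loss/update_net interface assumed by the example (adapt_fn stands for the inner-loop training on a single task):

def maml_step(a2c, theta, tasks, adapt_fn):
    # Hypothetical outline of one meta-iteration.
    adapted = []
    for task in tasks:
        a2c.set_weights(theta)        # start each task from the shared weights
        adapt_fn(a2c, task)           # inner-loop adaptation (K episodes)
        adapted.append(a2c.get_weights())
    for theta_i in adapted:
        a2c.set_weights(theta_i)
        a2c.interact()                # collect data with the adapted weights
        loss = a2c.get_loss()
        a2c.set_weights(theta)        # apply the meta-update to the shared weights
        a2c.update_net(loss)
        theta = a2c.get_weights()
    return theta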
Example #9
    def test_final(self, actor_weight_file, critic_weight_file):
        # Evaluate the performance of the agent over the test episodes by
        # calculating the cumulative reward of each episode. Interaction with
        # the environment happens here, irrespective of whether a replay
        # memory is used.

        episodes = []
        eval_rewards = []

        self.env_fin = gym.make('cp-v0')
        num_episodes = 50
        self.a2c.load_weights(actor_weight_file, critic_weight_file)

        base = np.array([1, 0.5, 2])
        G = np.array([1]) * 9.8
        MC = base * 0.5
        MP = base * 0.1
        L = base * 0.5
        F = base * 10

        fl = open('Experiments.csv', 'w')
        fl.write(
            'List of parameters: Gravity, Mass of Cart, Mass of Pole, Length, Force Magnitude\n'
        )
        fl.write('Output Reward: Mean, Standard Deviation\n')

        for g in G:
            for mc in MC:
                for mp in MP:
                    for l in L:
                        for f in F:
                            self.env_fin.env.my_init(G=g,
                                                     MC=mc,
                                                     MP=mp,
                                                     L=l,
                                                     F=f)
                            for i in range(num_episodes):
                                self.a2c.interact()
                                rewards, _ = self.a2c.evaluation(
                                    self.env_fin, 1)
                                rewards_mu, rewards_std = agg_double_list(
                                    rewards)
                                #print("Episode %d, Average Reward %.2f" %
                                #      (self.a2c.n_episodes+1, rewards_mu))
                                episodes.append(i + 1)
                                eval_rewards.append(rewards_mu)
                            print(g, mc, mp, l, f)
                            # Aggregate only the episodes of the current
                            # parameter configuration.
                            rewards_cfg = eval_rewards[-num_episodes:]
                            rm = float("{0:.2f}".format(np.mean(rewards_cfg)))
                            rs = float("{0:.2f}".format(np.std(rewards_cfg)))
                            str_cp = (str(g) + '& ' + str(mc) + '& ' + str(mp) +
                                      '& ' + str(l) + '& ' + str(f) + '& ')
                            str_cp = str_cp + str(rm) + ' &' + str(rs) + '\n'
                            fl.write(str_cp)
                            print("Rewards: Mean: %.2f, Std: %.2f" %
                                  (np.mean(rewards_cfg), np.std(rewards_cfg)))

        fl.close()

        episodes = np.array(episodes)
        eval_rewards = np.array(eval_rewards)

        # Save the plot
        base_path = os.path.join(self.environment_name, 'a2c_plot_test')
        if not os.path.exists(base_path):
            os.makedirs(base_path)
        file_name = os.path.join(base_path, 'Average_reward.png')

        plt.figure()
        plt.plot(episodes, eval_rewards)
        plt.title("%s" % self.environment_name)
        plt.xlabel("Episode")
        plt.ylabel("Average Reward")
        plt.legend(["A2C"])
        plt.savefig(file_name)