Example #1
def train(sess,
          env,
          args,
          actors,
          critics,
          noise,
          ave_n,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6):

    summary_ops, summary_vars = build_summaries(env.n)
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # callbacks = []
    # train_names = ['train_loss', 'train_mae']
    # callback = TensorBoard(args['summary_dir'])

    for actor in actors:
        actor.update_target()
    for critic in critics:
        # callback = TensorBoard(args['summary_dir'])
        # callback.set_model(critic.mainModel)
        # callbacks.append(callback)

        critic.update_target()

    replayMemory = None
    replayMemory_ddpg = None
    # prioritized_replay_beta_iters = None

    if args["prioritized"]:
        replayMemory = PrioritizedReplayMemory(args['buffer_size'],
                                               args["prioritized_alpha"])
        replayMemory_ddpg = ReplayMemory(int(args['buffer_size']),
                                         int(args['random_seed']))
    else:
        # without prioritized replay both names point to the same buffer,
        # so the two add() calls below insert each transition into it twice
        replayMemory_ddpg = replayMemory = ReplayMemory(
            int(args['buffer_size']), int(args['random_seed']))
    # Prioritized Replay
    # PrioritizedReplayMemory = PrioritizedReplayMemory(args['buffer_size'])

    for ep in range(int(args['max_episodes'])):

        start = time.time()

        s = env.reset()
        episode_reward = np.zeros((env.n, ))
        #episode_av_max_q = 0

        for stp in range(int(args['max_episode_len'])):

            action_dims_done = 0

            if args['render_env']:
                env.render()

            a = []

            for i in range(env.n):
                actor = actors[i]
                state_input = np.reshape(s[i], (-1, actor.state_dim))
                a.append(
                    actor.act(state_input,
                              noise[i]()).reshape(actor.action_dim, ))

            s2, r, done, _ = env.step(
                a)  # a is a list with each element being an array
            #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,)))
            replayMemory.add(s, a, r, done, s2)
            replayMemory_ddpg.add(s, a, r, done, s2)

            # Prioritized Replay Memory
            # replayMemory.store(s, a, r, done, s2)
            # replayMemory.sample(int(args["minibatch_size"]))
            # update priority with loss

            s = s2

            # MADDPG Adversary Agent
            for i in range(ave_n):

                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):

                    s_batch, a_batch, r_batch, d_batch, s2_batch, batch_idxes = None, None, None, None, None, None

                    if args["prioritized"]:
                        experience = replayMemory.sample(
                            args['minibatch_size'])
                        (s_batch, a_batch, r_batch, d_batch, s2_batch,
                         batch_idxes) = experience
                        print(len(batch_idxes))
                    else:
                        s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                            int(args['minibatch_size']))

                    a = []
                    for j in range(ave_n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  #batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))

                    #print(np.asarray(a).shape)
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    #print("a_for_critic", a_temp.shape)
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([
                        x for x in s2_batch[:, i]
                    ])  # Checked till this point, should be fine.
                    # print("s2_batch_i", s2_batch_i.shape)
                    targetQ = critic.predict_target(
                        s2_batch_i, a_for_critic)  # Should  work, probably

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    td_errors = critic.train(
                        s_batch_i,
                        np.asarray(
                            [x.flatten() for x in a_batch[:, 0:ave_n, :]]),
                        np.asarray(yi))

                    if args["prioritized"]:
                        print(td_errors)
                        new_priorities = np.abs(
                            td_errors) + prioritized_replay_eps
                        print(len(new_priorities))
                        replayMemory.update_priorities(batch_idxes,
                                                       new_priorities)

                    actions_pred = []
                    # for j in range(ave_n):
                    for j in range(ave_n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(
                            actors[j].predict(state_batch_j)
                        )  # Should work till here, roughly, probably

                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    actor.train(s_batch_i, grads)

                action_dims_done = action_dims_done + actor.action_dim

            # Only DDPG agent

            for i in range(ave_n, env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args["minibatch_size"]):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory_ddpg.miniBatch(
                        int(args["minibatch_size"]))

                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    action = np.asarray(actor.predict_target(s_batch_i))

                    action_for_critic = np.asarray(
                        [x.flatten() for x in action])

                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])

                    # critic.predict_target(next state batch, actor_target(next state batch))
                    targetQ = critic.predict_target(s2_batch_i,
                                                    action_for_critic)

                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        # If ep is end
                        if d_batch[:, i][k]:
                            y_i.append(r_batch[:, i][k])
                        else:
                            y_i.append(r_batch[:, i][k] +
                                       critic.gamma * targetQ[k])
                    # state batch for agent i
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    critic.train(
                        s_batch_i,
                        np.asarray([x.flatten() for x in a_batch[:, i]]),
                        np.asarray(y_i))

                    action_for_critic_pred = actor.predict(s2_batch_i)

                    gradients = critic.action_gradients(
                        s_batch_i, action_for_critic_pred)[:, :]

                    actor.train(s_batch_i, gradients)

            for i in range(0, env.n):
                actor = actors[i]
                critic = critics[i]
                actor.update_target()
                critic.update_target()

            episode_reward += r
            #print(done)
            if stp == int(args["max_episode_len"]) - 1 or np.all(done):

                ave_reward = 0.0
                good_reward = 0.0
                for i in range(env.n):
                    if i < ave_n - 1:
                        ave_reward += episode_reward[i]
                    else:
                        good_reward += episode_reward[i]

                #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ave_reward,
                                           summary_vars[1]: good_reward
                                       })
                # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))})
                writer.add_summary(summary_str, ep)
                writer.flush()
                # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp))))
                showReward(episode_reward, env.n, ep, start)
                break

            #if stp == int(args['max_episode_len'])-1:
            #showReward(episode_reward, env.n, ep)

        # save model
        if ep % 50 == 0 and ep != 0:
            print("Starting saving model weights every 50 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, args["modelFolder"])
            print("Model weights saved")

        if ep % 200 == 0 and ep != 0:
            directory = args["modelFolder"] + "ep" + str(ep) + "/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            print("Starting saving model weights to folder every 200 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, directory)
            print("Model weights saved to folder")
Example #2
def train(sess, env, args, actors, critics, noise, ave_n):

    summary_ops, summary_vars = build_summaries(env.n)
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # callbacks = []
    # train_names = ['train_loss', 'train_mae']
    # callback = TensorBoard(args['summary_dir'])

    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()

    #for i in range(20):
    #		print([noise[i]()for i in range(env.n)])

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes'])):

        start = time.time()

        s = env.reset()
        episode_reward = np.zeros((env.n, ))
        #episode_av_max_q = 0

        for stp in range(int(args['max_episode_len'])):

            action_dims_done = 0

            if args['render_env']:
                env.render()

            a = []

            for i in range(env.n):
                actor = actors[i]
                state_input = np.reshape(s[i], (-1, actor.state_dim))
                a.append(
                    actor.act(state_input,
                              noise[i]()).reshape(actor.action_dim, ))
            # print(a)
            #time.sleep(10)
            s2, r, done, _ = env.step(
                a)  # a is a list with each element being an array
            #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,)))
            #if ep % 50 == 0:
            #	env.render()
            replayMemory.add(s, a, r, done, s2)
            s = s2
            # MADDPG Adversary Agent
            for i in range(ave_n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['m_size']):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['m_size']))
                    a = []
                    for j in range(ave_n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  #batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                    targetQ = critic.predict_target(s2_batch_i, a_for_critic)
                    yi = []
                    for k in range(int(args['m_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    # a2 = actor.predict_target(s_batch)
                    # Q_target = critic.predict_target(s2_batch, a2)
                    # y = r + gamma * Q_target
                    # TD loss = yi - critic.predict(s_batch, a_batch)
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    a_batch_data = np.asarray(
                        [x.flatten() for x in a_batch[:, 0:ave_n, :]])
                    target_q = np.asarray(yi)
                    # loss = batch
                    losses = []
                    # clip
                    index = 0
                    # number of losses
                    loss_num = int(int(args['m_size']) / int(args['n_size']))
                    for _ in range(loss_num):  # one critic loss per n_size chunk
                        loss = critic.get_loss(
                            s_batch_i[index:index + int(args["n_size"])],
                            a_batch_data[index:index + int(args["n_size"])],
                            target_q[index:index + int(args["n_size"])])
                        losses.append(loss)
                        index += int(args["n_size"])
                    # which has max loss
                    sorted_index = np.argsort(losses).tolist()
                    max_index = sorted_index[-1]
                    # clip index
                    head = max_index * int(args["n_size"])
                    tail = head + int(args["n_size"])
                    # clipped batch data with higher losses
                    prioritized_a_batch = a_batch_data[head:tail]
                    prioritized_s_batch = s_batch_i[head:tail]
                    prioritized_target_q = target_q[head:tail]
                    # critic train
                    critic.train(prioritized_s_batch, prioritized_a_batch,
                                 prioritized_target_q)
                    actions_pred = []
                    # for j in range(ave_n):
                    for j in range(ave_n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(actors[j].predict(
                            state_batch_j[head:tail]))
                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    grads = critic.action_gradients(
                        prioritized_s_batch,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    # actor train
                    actor.train(prioritized_s_batch, grads)
                action_dims_done = action_dims_done + actor.action_dim
            # Only DDPG agent
            for i in range(ave_n, env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args["minibatch_size"]):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args["minibatch_size"]))
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    action = np.asarray(actor.predict_target(s_batch_i))
                    action_for_critic = np.asarray(
                        [x.flatten() for x in action])
                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                    targetQ = critic.predict_target(s2_batch_i,
                                                    action_for_critic)
                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            y_i.append(r_batch[:, i][k])
                        else:
                            y_i.append(r_batch[:, i][k] +
                                       critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(
                        s_batch_i,
                        np.asarray([x.flatten() for x in a_batch[:, i]]),
                        np.asarray(y_i))
                    action_for_critic_pred = actor.predict(s2_batch_i)
                    gradients = critic.action_gradients(
                        s_batch_i, action_for_critic_pred)[:, :]
                    actor.train(s_batch_i, gradients)
            for i in range(0, env.n):
                actor = actors[i]
                critic = critics[i]
                actor.update_target()
                critic.update_target()

            episode_reward += r
            #print(done)
            if stp == int(args["max_episode_len"]) - 1 or np.all(done):

                ave_reward = 0.0
                good_reward = 0.0
                for i in range(env.n):
                    if i < ave_n:
                        ave_reward += episode_reward[i]
                    else:
                        good_reward += episode_reward[i]

                #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ave_reward,
                                           summary_vars[1]: good_reward
                                       })
                # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))})
                writer.add_summary(summary_str, ep)
                writer.flush()
                # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp))))
                showReward(episode_reward, env.n, ep, start)
                break

            #if stp == int(args['max_episode_len'])-1:
            #showReward(episode_reward, env.n, ep)

        # save model
        if ep % 50 == 0 and ep != 0:
            print("Starting saving model weights every 50 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, args["modelFolder"])
            print("Model weights saved")

        if ep % 200 == 0 and ep != 0:
            directory = args["modelFolder"] + "ep" + str(ep) + "/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            print("Starting saving model weights to folder every 200 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, directory)
            print("Model weights saved to folder")
Example #3
def train(sess, env, args, actors, critics, noise):

    summary_ops, summary_vars = build_summaries(env.n)
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    for actor in actors:
        actor.update_target()
    for critic in critics:
        critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes'])):

        s = env.reset()
        episode_reward = np.zeros((env.n, ))
        #episode_av_max_q = 0

        for stp in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            a = []
            action_dims_done = 0

            for i in range(env.n):
                actor = actors[i]
                a.append(
                    actor.act(np.reshape(s[i], (-1, actor.state_dim)),
                              noise[i]()).reshape(actor.action_dim, ))

            s2, r, done, _ = env.step(
                a)  # a is a list with each element being an array
            #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,)))
            replayMemory.add(s, a, r, done, s2)
            s = s2

            for i in range(env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):

                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))
                    a = []
                    for j in range(env.n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  #batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))

                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([
                        x for x in s2_batch[:, i]
                    ])  # Checked till this point, should be fine.
                    targetQ = critic.predict_target(
                        s2_batch_i, a_for_critic)  # Should  work, probably

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    critic.train(s_batch_i,
                                 np.asarray([x.flatten() for x in a_batch]),
                                 np.asarray(yi))
                    #predictedQValue = critic.train(s_batch,np.asarray([x.flatten() for x in a_batch]),yi)
                    #episode_av_max_q += np.amax(predictedQValue)

                    actions_pred = []
                    for j in range(env.n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(
                            actors[j].predict(state_batch_j)
                        )  # Should work till here, roughly, probably

                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    actor.train(s_batch_i, grads)
                    #print("Training agent {}".format(i))
                    actor.update_target()
                    critic.update_target()

                # advance the offset so the next agent slices its own
                # columns of the joint-action gradient
                action_dims_done = action_dims_done + actor.action_dim
            episode_reward += r
            if np.all(done):
                #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
                summary_str = sess.run(
                    summary_ops,
                    feed_dict={summary_vars[0]: np.sum(episode_reward)})
                writer.add_summary(summary_str, ep)
                writer.flush()
                #print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp))))
                print('|Reward: {:d},{:d},{:d},{:d}	| Episode: {:d}'.format(
                    int(episode_reward[0]), int(episode_reward[1]),
                    int(episode_reward[2]), int(episode_reward[3]), ep))
                break
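In these multi-agent loops the critic takes the joint action of all agents, so critic.action_gradients returns one gradient column per joint action dimension, and each actor is trained only on the columns that belong to its own action; action_dims_done tracks the running offset. The snippet below is a small illustration of that slicing with made-up action sizes; it is not part of the original code.

import numpy as np

action_dims = [2, 2, 5]                              # made-up per-agent action sizes
joint_grads = np.random.randn(32, sum(action_dims))  # dQ/d(joint action), one row per sample

offset = 0
per_agent_grads = []
for dim in action_dims:
    # same slice as [:, action_dims_done:action_dims_done + actor.action_dim] above
    per_agent_grads.append(joint_grads[:, offset:offset + dim])
    offset += dim

print([g.shape for g in per_agent_grads])  # [(32, 2), (32, 2), (32, 5)]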
Example #4
def distributed_train(sess, env, args, actors, critics, noise, ave_n):
    """
    1. replay memory
        - for each timestep
        2. async batch data 
        3. 
    """
    summary_ops, summary_vars = build_summaries(env.n)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)
    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    start_time = 0.0
    end_time = 0.0

    for ep in range(int(args['max_episodes'])):
        # collecting reward
        s = env.reset()
        episode_reward = np.zeros((env.n, ))

        start = time.time()

        for step in range(int(args['max_episode_len'])):
            action_dims_done = 0
            a = []
            for i in range(env.n):
                actor = actors[i]
                state_input = np.reshape(s[i], (-1, actor.state_dim))
                a.append(
                    actor.act(state_input,
                              noise[i]()).reshape(actor.action_dim, ))
            s2, r, done, _ = env.step(
                a)  # a is a list with each element being an array
            episode_reward += r
            if replayMemory.size() > int(args["minibatch_size"]):
                # MADDPG Adversary Agent
                for i in range(ave_n):
                    actor = actors[i]
                    critic = critics[i]
                    if replayMemory.size() > int(args['m_size']):
                        s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                            int(args['m_size']))
                        a = []
                        for j in range(ave_n):
                            state_batch_j = np.asarray(
                                [x for x in s_batch[:, j]]
                            )  #batch processing will be much more efficient even though reshaping will have to be done
                            a.append(actors[j].predict_target(state_batch_j))
                        a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                        a_for_critic = np.asarray(
                            [x.flatten() for x in a_temp])
                        s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                        targetQ = critic.predict_target(
                            s2_batch_i, a_for_critic)
                        yi = []
                        for k in range(int(args['m_size'])):
                            if d_batch[:, i][k]:
                                yi.append(r_batch[:, i][k])
                            else:
                                yi.append(r_batch[:, i][k] +
                                          critic.gamma * targetQ[k])
                        # a2 = actor.predict_target(s_batch)
                        # Q_target = critic.predict_target(s2_batch, a2)
                        # y = r + gamma * Q_target
                        # TD loss = yi - critic.predict(s_batch, a_batch)
                        s_batch_i = np.asarray([x for x in s_batch[:, i]])
                        a_batch_data = np.asarray(
                            [x.flatten() for x in a_batch[:, 0:ave_n, :]])
                        target_q = np.asarray(yi)
                        #############################################
                        ##   prioritized_batch
                        #############################################
                        # loss = batch
                        losses = []
                        # clip
                        index = 0
                        # number of losses
                        loss_num = int(
                            int(args['m_size']) / int(args['n_size']))
                        for _ in range(loss_num):  # one critic loss per n_size chunk
                            loss = critic.get_loss(
                                s_batch_i[index:index + int(args["n_size"])],
                                a_batch_data[index:index +
                                             int(args["n_size"])],
                                target_q[index:index + int(args["n_size"])])
                            losses.append(loss)
                            index += int(args["n_size"])
                        # which has max loss
                        sorted_index = np.argsort(losses).tolist()
                        max_index = sorted_index[-1]
                        # clip index
                        head = max_index * int(args["n_size"])
                        tail = head + int(args["n_size"])
                        # clipped batch data with higher losses
                        prioritized_a_batch = a_batch_data[head:tail]
                        prioritized_s_batch = s_batch_i[head:tail]
                        prioritized_target_q = target_q[head:tail]
                        #############################################
                        ##   prioritized_batch
                        #############################################
                        # critic train
                        critic.train(prioritized_s_batch, prioritized_a_batch,
                                     prioritized_target_q)
                        actions_pred = []
                        # for j in range(ave_n):
                        for j in range(ave_n):
                            state_batch_j = np.asarray(
                                [x for x in s2_batch[:, j]])
                            actions_pred.append(actors[j].predict(
                                state_batch_j[head:tail]))
                        a_temp = np.transpose(np.asarray(actions_pred),
                                              (1, 0, 2))
                        a_for_critic_pred = np.asarray(
                            [x.flatten() for x in a_temp])
                        grads = critic.action_gradients(
                            prioritized_s_batch, a_for_critic_pred
                        )[:,
                          action_dims_done:action_dims_done + actor.action_dim]
                        # actor train
                        actor.train(prioritized_s_batch, grads)
                    action_dims_done = action_dims_done + actor.action_dim
                # Only DDPG agent
                for i in range(ave_n, env.n):
                    actor = actors[i]
                    critic = critics[i]
                    if replayMemory.size() > int(args["minibatch_size"]):
                        s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                            int(args["minibatch_size"]))
                        s_batch_i = np.asarray([x for x in s_batch[:, i]])
                        action = np.asarray(actor.predict_target(s_batch_i))
                        action_for_critic = np.asarray(
                            [x.flatten() for x in action])
                        s2_batch_i = np.asarray([x for x in s2_batch[:, i]])
                        targetQ = critic.predict_target(
                            s2_batch_i, action_for_critic)
                        y_i = []
                        for k in range(int(args['minibatch_size'])):
                            if d_batch[:, i][k]:
                                y_i.append(r_batch[:, i][k])
                            else:
                                y_i.append(r_batch[:, i][k] +
                                           critic.gamma * targetQ[k])
                        s_batch_i = np.asarray([x for x in s_batch[:, i]])
                        critic.train(
                            s_batch_i,
                            np.asarray([x.flatten() for x in a_batch[:, i]]),
                            np.asarray(y_i))
                        action_for_critic_pred = actor.predict(s2_batch_i)
                        gradients = critic.action_gradients(
                            s_batch_i, action_for_critic_pred)[:, :]
                        actor.train(s_batch_i, gradients)
                for i in range(0, env.n):
                    actor = actors[i]
                    critic = critics[i]
                    actor.update_target()
                    critic.update_target()

            if step == int(args["max_episode_len"]) - 1 or np.all(done):
                #############################################
                ##   Record reward data into tensorboard
                #############################################
                ave_reward = 0.0
                good_reward = 0.0
                for i in range(env.n):
                    if i < ave_n:
                        ave_reward += episode_reward[i]
                    else:
                        good_reward += episode_reward[i]
                #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ave_reward,
                                           summary_vars[1]: good_reward
                                       })
                # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))})
                writer.add_summary(summary_str, ep)
                writer.flush()
                showReward(episode_reward, env.n, ep, start)
                break

        if ep % 50 == 0 and ep != 0:
            print("Starting saving model weights every 50 episodes")
            for i in range(env.n):
                saveWeights(actors[i], i, args["modelFolder"])
            print("Model weights saved")
        if ep % 100 == 0 and ep != 0:
            directory = args["modelFolder"] + "ep" + str(ep) + "/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            print("Starting saving model weights to folder every 100 episodes")
            for i in range(env.n):
                saveWeights(actors[i], i, directory)
            print("Model weights saved to folder")

        # recieve batch data from workers
        batch_data = [comm.recv(source=i, tag=i) for i in range(1, size)]
        for batch in batch_data:
            for item in batch:
                (s, a, r, d, s2) = item
                replayMemory.add(s, a, r, d, s2)
        # send weights to workers
        actor_weights = [actor.mainModel.get_weights() for actor in actors]
        for i in range(1, size):
            comm.send(actor_weights, dest=i, tag=i)
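The last block of distributed_train assumes module-level mpi4py objects (comm, size) and worker processes on ranks 1..size-1 that trade collected experience for fresh actor weights. The worker side is not shown in any of these examples; the sketch below is only a plausible counterpart to the learner's comm.recv / comm.send calls, and the actor.mainModel.set_weights call is an assumption based on the get_weights call above.

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

def worker_loop(env, actors, noise, episodes, max_episode_len):
    """Hypothetical worker: collect one episode of experience, send it to the
    learner (rank 0), then load the refreshed actor weights it sends back."""
    for _ in range(episodes):
        batch = []
        s = env.reset()
        for _ in range(max_episode_len):
            a = []
            for i in range(env.n):
                state_input = np.reshape(s[i], (-1, actors[i].state_dim))
                a.append(actors[i].act(state_input,
                                       noise[i]()).reshape(actors[i].action_dim, ))
            s2, r, done, _ = env.step(a)
            batch.append((s, a, r, done, s2))
            s = s2
            if np.all(done):
                break
        comm.send(batch, dest=0, tag=rank)             # pairs with comm.recv(source=i, tag=i)
        actor_weights = comm.recv(source=0, tag=rank)  # pairs with comm.send(..., dest=i, tag=i)
        for actor, weights in zip(actors, actor_weights):
            actor.mainModel.set_weights(weights)       # assumed Keras-style setter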
Example #5
print('Training for ' + str(frames) + ' frames.')
print('Batch size = ', batch_size)
print('Initial memory size = ', len(memory.data))
print('Update Q target frequency = ', update_frequency)
print('Evaluation frequency = ', evaluation_frequency)

n = 0
j = 0

while n < frames:

    done = False
    initial_state = env.reset()
    action = agent.getAction(LazyFrame2Torch(initial_state))
    state, reward, done, _ = env.step(action)
    memory.add(initial_state, action, reward, state, done)
    agent.decrease_epsilon()
    n += 1  #5367

    while (not done):

        action = agent.getAction(LazyFrame2Torch(state))
        next_state, reward, done, _ = env.step(action)
        memory.add(state, action, reward, next_state, done)
        state = next_state
        agent.decrease_epsilon()
        n += 1  #5368

        if memory.current_size >= batch_size:

            # get batch of size 32 from replay memory
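The snippet above stops right after the comment about sampling a batch of 32 from the replay memory; the actual update step is not included. The sketch below is a hypothetical DQN-style update written under assumed names (memory.sample returning torch tensors, a q_net / target_net pair, an optimizer), none of which are confirmed by the snippet.

import torch
import torch.nn.functional as F

def dqn_update(memory, q_net, target_net, optimizer, batch_size, gamma):
    """One hypothetical Q-learning step on a sampled minibatch."""
    states, actions, rewards, next_states, dones = memory.sample(batch_size)
    # Q(s, a) for the actions that were actually taken
    q_values = q_net(states).gather(1, actions.long().unsqueeze(1)).squeeze(1)
    with torch.no_grad():
        # standard DQN target: r + gamma * max_a' Q_target(s', a'), zeroed at terminals
        next_q = target_net(next_states).max(dim=1)[0]
        targets = rewards + gamma * (1.0 - dones.float()) * next_q
    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()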
Example #6
def train(sess,env,args,actors,critics,noise, ave_n):

	summary_ops,summary_vars = build_summaries(env.n)
	init = tf.global_variables_initializer()
	sess.run(init)
	writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)
	
	# callbacks = []
	# train_names = ['train_loss', 'train_mae']
	# callback = TensorBoard(args['summary_dir'])

	for actor in actors:
		actor.update_target()
	for critic in critics:
		# callback = TensorBoard(args['summary_dir'])
		# callback.set_model(critic.mainModel)
		# callbacks.append(callback)

		critic.update_target()
	
	replayMemory = ReplayMemory(int(args['buffer_size']),int(args['random_seed']))

	for ep in range(int(args['max_episodes'])):

		start = time.time()

		s = env.reset()
		episode_reward = np.zeros((env.n,))
		#episode_av_max_q = 0

		for stp in range(int(args['max_episode_len'])):

			losses = []
			# action_dims_done = 0

			if args['render_env']:
				env.render()
			
			a = []

			for i in range(env.n):
				actor = actors[i]
				state_input = np.reshape(s[i],(-1,actor.state_dim))
				a.append(actor.act(state_input, noise[i]()).reshape(actor.action_dim,))
						
			s2,r,done,_ = env.step(a) # a is a list with each element being an array
			#replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,)))
			replayMemory.add(s,a,r,done,s2)
			s = s2

			
			# Only DDPG agent
			
			for i in range(env.n):
				actor = actors[i]
				critic = critics[i]
				if replayMemory.size() > int(args["minibatch_size"]):
					s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(int(args["minibatch_size"]))
					
					# action for critic					
					s_batch_i = np.asarray([x for x in s_batch[:,i]])

					action = np.asarray(actor.predict_target(s_batch_i))

					action_for_critic = np.asarray([x.flatten() for x in action])

					s2_batch_i = np.asarray([x for x in s2_batch[:, i]])

					# critic.predict_target(next state batch, actor_target(next state batch))
					targetQ = critic.predict_target(s2_batch_i, action_for_critic)

					y_i = []
					for k in range(int(args['minibatch_size'])):
						# If ep is end
						if d_batch[:, i][k]:
							y_i.append(r_batch[:, i][k])
						else:
							y_i.append(r_batch[:, i][k] + critic.gamma * targetQ[k])
					# state batch for agent i
					s_batch_i= np.asarray([x for x in s_batch[:, i]])

					critic.train(s_batch_i, np.asarray([x.flatten() for x in a_batch[:, i]]), np.asarray(y_i))

					action_for_critic_pred = actor.predict(s2_batch_i)

					gradients = critic.action_gradients(s_batch_i, action_for_critic_pred)[:, :]

					actor.train(s_batch_i, gradients)
			
			for i in range(0, env.n):
				actor = actors[i]
				critic = critics[i]
				actor.update_target()
				critic.update_target()
			
			episode_reward += r
			#print(done)
			if stp == int(args["max_episode_len"])-1 or np.all(done) :
				
				ave_reward = 0.0
				good_reward = 0.0
				"""
				for i in range(env.n):
					if i < ave_n:
						ave_reward += episode_reward[i]
					else:
						good_reward += episode_reward[i]
				"""
				ave_reward = episode_reward[0]
				good_reward = episode_reward[2]	
				#summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
				summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: ave_reward, summary_vars[1]: good_reward})
				# summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))})
				writer.add_summary(summary_str, ep)
				writer.flush()
				# print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp))))
				showReward(episode_reward, env.n, ep, start)
				break

			#if stp == int(args['max_episode_len'])-1:
				#showReward(episode_reward, env.n, ep)

		# save model
		if ep % 50 == 0 and ep != 0:
			print("Starting saving model weights every 50 episodes")
			for i in range(env.n):
				# saveModel(actors[i], i, args["modelFolder"])
				saveWeights(actors[i], i, args["modelFolder"])
			print("Model weights saved")

		if ep % 200 == 0 and ep != 0:
			directory = args["modelFolder"] + "ep" + str(ep) + "/"
			if not os.path.exists(directory):
				os.makedirs(directory)
			print("Starting saving model weights to folder every 200 episodes")
			for i in range(env.n):
				# saveModel(actors[i], i, args["modelFolder"])
				saveWeights(actors[i], i, directory)
			print("Model weights saved to folder")
Example #7
def train_ddpg(env,
               num_steps,
               replay_size,
               batch_size,
               gamma,
               noise,
               num_saves=5,
               replay_prepopulate_steps=0,
               lr_critic=1e-2,
               lr_actor=1e-4,
               tau=0.001,
               max_action=1.0,
               min_action=-1.0,
               rand_face=False):
    # get the state_size from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    # initialize the Critic and target Critic models
    critic_model = Critic(state_size, action_size).to(device)
    critic_target = copy.deepcopy(critic_model)

    # initialize the Actor and target Actor models
    actor_model = Actor(state_size, action_size, 1.0).to(device)
    actor_target = copy.deepcopy(actor_model)

    # initialize the optimizer
    critic_optimizer = torch.optim.Adam(critic_model.parameters(), lr=lr_critic)
    actor_optimizer = torch.optim.Adam(actor_model.parameters(), lr=lr_actor)

    # initialize the replay memory and prepopulate it
    memory = ReplayMemory(replay_size, state_size, action_size)
    memory.populate(env, replay_prepopulate_steps, rand_face)

    # initiate lists to store returns, lengths and losses
    returns = []
    lengths = []
    losses = []

    # initiate structures to store the models at different stages of training
    t_saves = np.linspace(0, num_steps, num_saves - 1, endpoint=False)
    saved_models = {}
    saved_policies = {}

    i_episode = 0  # use this to indicate the index of the current episode
    t_episode = 0  # use this to indicate the time-step inside current episode
    G = 0  # initializing return variable (incremental tally during each episode)

    state = env.reset(rand_face)  # initialize state of first episode

    # iterate for a total of `num_steps` steps
    pbar = tqdm.trange(num_steps)
    for t_total in pbar:
        # use t_total to indicate the time-step from the beginning of training
        # save model
        if t_total in t_saves:
            model_name = '%04.1f' % (100 * t_total / num_steps)
            model_name = model_name.replace('.', '_')
            saved_models[model_name] = copy.deepcopy(critic_model)
            saved_policies[model_name] = copy.deepcopy(actor_model)

        action_arr = (actor_model(
            torch.FloatTensor(state).to(device)).cpu().data.numpy()) + noise()
        action_arr = (((action_arr + 1.0) / 2.0) *
                      (max_action - min_action)) + min_action

        action = np.clip(action_arr, a_min=min_action, a_max=max_action)

        ss, r, done, info = env.step(action)
        memory.add(state=state,
                   action=action,
                   reward=r,
                   next_state=ss,
                   done=done)

        batch = memory.sample(batch_size)
        loss = train_ddpg_batch(critic_optimizer, actor_optimizer, batch,
                                critic_model, critic_target, actor_model,
                                actor_target, gamma, tau)
        losses.append(loss)

        if done:
            # When episode is done, collect return, update counters, and reset env
            G += (gamma**t_episode) * r
            returns.append(G)

            pbar.set_description('Episode: %d | Steps: %d | Return: %5.2f' %
                                 (i_episode, t_episode + 1, G))

            lengths.append(t_episode + 1)
            t_episode = 0
            i_episode += 1
            G = 0
            state = env.reset(rand_face)

        else:
            # While episode is not done, move state pointer forward and update return
            state = ss
            G += (gamma**t_episode) * r
            t_episode += 1

    saved_models['100_0'] = copy.deepcopy(critic_model)
    saved_policies['100_0'] = copy.deepcopy(actor_model)

    return (
        saved_models,
        saved_policies,
        np.array(returns),
        np.array(lengths),
        np.array(losses),
    )
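train_ddpg_batch is called above but its body is not included in this example. The sketch below is a minimal standard DDPG update consistent with the call site, assuming batch unpacks into torch tensors (states, actions, rewards, next_states, dones) and that the Critic's forward takes (state, action); the real implementation may differ.

import torch
import torch.nn.functional as F

def train_ddpg_batch(critic_optimizer, actor_optimizer, batch, critic_model,
                     critic_target, actor_model, actor_target, gamma, tau):
    states, actions, rewards, next_states, dones = batch

    # critic update: regress Q(s, a) toward r + gamma * Q'(s', pi'(s'))
    with torch.no_grad():
        target_q = critic_target(next_states, actor_target(next_states)).reshape(-1)
        y = rewards.reshape(-1) + gamma * (1.0 - dones.reshape(-1).float()) * target_q
    critic_loss = F.mse_loss(critic_model(states, actions).reshape(-1), y)
    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # actor update: maximize Q(s, pi(s)) by minimizing its negation
    actor_loss = -critic_model(states, actor_model(states)).mean()
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    # Polyak-average the target networks
    for p, p_t in zip(critic_model.parameters(), critic_target.parameters()):
        p_t.data.mul_(1.0 - tau).add_(tau * p.data)
    for p, p_t in zip(actor_model.parameters(), actor_target.parameters()):
        p_t.data.mul_(1.0 - tau).add_(tau * p.data)

    return critic_loss.item()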
Example #8
        terminal = 0
        reward = 0
        if done:
            terminal = 1
            if not step >= 195:
                reward = -1
        sum_reward += reward

        obs_queue.put(obs)
        reward_queue.put(reward)
        action_queue.put(action)
        step_reward = step_reward / gamma + reward * (gamma**N_STEP)

        if step >= N_STEP - 1:
            memory.add(obs_queue.get(), action_queue.get(), step_reward,
                       next_obs, terminal)
            step_reward -= reward_queue.get()

        if done:
            while not action_queue.empty():
                step_reward = step_reward / gamma
                memory.add(obs_queue.get(), action_queue.get(), step_reward,
                           next_obs, terminal)
                step_reward -= reward_queue.get()

        obs = next_obs.copy()

        step += 1
        total_step += 1
        if total_step < initial_exploration:
            continue
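The step_reward arithmetic above (divide by gamma, add reward * gamma**N_STEP, subtract the oldest reward) is meant to maintain an N-step discounted return over a sliding window of the last N_STEP rewards. A more direct, deque-based way to build N-step transitions is sketched below purely for illustration; it is not the original code and does not reproduce the snippet's exact scaling.

from collections import deque

def make_nstep_adder(memory, n_step, gamma):
    """Return an add(obs, action, reward, next_obs, done) function that buffers
    the last n_step transitions and writes n-step transitions
    (s_t, a_t, sum_k gamma**k * r_{t+k}, s_{t+n}, done) into memory."""
    buf = deque(maxlen=n_step)

    def _emit(next_obs, done):
        s0, a0, _ = buf[0]
        g = sum((gamma ** k) * r for k, (_, _, r) in enumerate(buf))
        memory.add(s0, a0, g, next_obs, done)

    def add(obs, action, reward, next_obs, done):
        buf.append((obs, action, reward))
        if len(buf) == n_step:
            _emit(next_obs, done)
        if done:
            # flush the remaining shorter-than-n tails at episode end
            if len(buf) < n_step:
                _emit(next_obs, done)
            while len(buf) > 1:
                buf.popleft()
                _emit(next_obs, done)
            buf.clear()

    return add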
Example #9
    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 1

    while True:
        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.act(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward
        ##############################train######################
        if replayMemory.size() >= 128:
            state_b, action_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniBatch(
                int(64))
            next_state_b_value = actor.predict(next_state_b)
            state_b_value = actor.predict(state_b)
            length = state_b.shape[0]

            for i in range(length):
                target_next = reward_b[i]
                if not done_b[i]:
Example #10
        step_reward = step_reward / gamma + reward * (gamma**N_STEP)

        if step >= N_STEP - 1:
            with torch.no_grad():
                max_next_q_value_index = qf(torch.Tensor([next_obs])).max(
                    dim=1, keepdim=True)[1].numpy().squeeze()
                max_next_q_value = target_qf(torch.Tensor(
                    [next_obs]))[0][max_next_q_value_index].numpy()
                current_state = obs_queue.get()
                current_action = action_queue.get()
                q_value = qf(torch.Tensor([current_state
                                           ]))[0][current_action].numpy()
                td_error = abs(step_reward + max_next_q_value *
                               (gamma**N_STEP) - q_value)
                priority = td_error
                memory.add(current_state, current_action, step_reward,
                           next_obs, priority, terminal)
                step_reward -= reward_queue.get()

        if done:
            while not action_queue.empty():
                with torch.no_grad():
                    step_reward = step_reward / gamma
                    max_next_q_value_index = qf(torch.Tensor([next_obs])).max(
                        dim=1, keepdim=True)[1].numpy().squeeze()
                    max_next_q_value = target_qf(torch.Tensor(
                        [next_obs]))[0][max_next_q_value_index].numpy()
                    current_state = obs_queue.get()
                    current_action = action_queue.get()
                    q_value = qf(torch.Tensor([current_state
                                               ]))[0][current_action].numpy()
                    td_error = abs(step_reward + max_next_q_value *
                                   (gamma**N_STEP) - q_value)
                    priority = td_error
                    memory.add(current_state, current_action, step_reward,
                               next_obs, priority, terminal)
                    step_reward -= reward_queue.get()
Example #11
def train(sess, env, args, actors, critics, noise, ave_n):

    summary_ops, summary_vars = build_summaries(env.n)
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # callbacks = []
    # train_names = ['train_loss', 'train_mae']
    # callback = TensorBoard(args['summary_dir'])

    for actor in actors:
        actor.update_target()
    for critic in critics:
        # callback = TensorBoard(args['summary_dir'])
        # callback.set_model(critic.mainModel)
        # callbacks.append(callback)

        critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for ep in range(int(args['max_episodes'])):

        start = time.time()

        s = env.reset()
        episode_reward = np.zeros((env.n, ))
        #episode_av_max_q = 0

        for stp in range(int(args['max_episode_len'])):

            losses = []
            action_dims_done = 0

            if args['render_env']:
                env.render()

            a = []

            for i in range(env.n):
                actor = actors[i]
                state_input = np.reshape(s[i], (-1, actor.state_dim))
                a.append(
                    actor.act(state_input,
                              noise[i]()).reshape(actor.action_dim, ))

            s2, r, done, _ = env.step(
                a)  # a is a list with each element being an array
            #replayMemory.add(np.reshape(s,(actor.input_dim,)),np.reshape(a,(actor.output_dim,)),r,done,np.reshape(s2,(actor.input_dim,)))
            replayMemory.add(s, a, r, done, s2)
            s = s2

            # MADDPG Adversary Agent
            for i in range(ave_n):

                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args['minibatch_size']):

                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args['minibatch_size']))
                    a = []
                    for j in range(ave_n):
                        state_batch_j = np.asarray(
                            [x for x in s_batch[:, j]]
                        )  #batch processing will be much more efficient even though reshaping will have to be done
                        a.append(actors[j].predict_target(state_batch_j))
                    #print(np.asarray(a).shape)
                    a_temp = np.transpose(np.asarray(a), (1, 0, 2))
                    #print("a_for_critic", a_temp.shape)
                    a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    s2_batch_i = np.asarray([
                        x for x in s2_batch[:, i]
                    ])  # Checked till this point, should be fine.
                    # print("s2_batch_i", s2_batch_i.shape)
                    targetQ = critic.predict_target(
                        s2_batch_i, a_for_critic)  # Should  work, probably

                    yi = []
                    for k in range(int(args['minibatch_size'])):
                        if d_batch[:, i][k]:
                            yi.append(r_batch[:, i][k])
                        else:
                            yi.append(r_batch[:, i][k] +
                                      critic.gamma * targetQ[k])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    # critic.train()
                    #critic.train(s_batch_i,np.asarray([x.flatten() for x in a_batch]),np.asarray(yi))
                    loss = critic.train(
                        s_batch_i,
                        np.asarray(
                            [x.flatten() for x in a_batch[:, 0:ave_n, :]]),
                        np.asarray(yi))

                    losses.append(loss)

                    # callback.set_model(critic.mainModel)

                    # write_log(callback, train_names, logs, ep)
                    #predictedQValue = critic.train(s_batch,np.asarray([x.flatten() for x in a_batch]),yi)
                    #episode_av_max_q += np.amax(predictedQValue)

                    actions_pred = []
                    # for j in range(ave_n):
                    for j in range(ave_n):
                        state_batch_j = np.asarray([x for x in s2_batch[:, j]])
                        actions_pred.append(
                            actors[j].predict(state_batch_j)
                        )  # Should work till here, roughly, probably

                    a_temp = np.transpose(np.asarray(actions_pred), (1, 0, 2))
                    a_for_critic_pred = np.asarray(
                        [x.flatten() for x in a_temp])
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])
                    grads = critic.action_gradients(
                        s_batch_i,
                        a_for_critic_pred)[:,
                                           action_dims_done:action_dims_done +
                                           actor.action_dim]
                    actor.train(s_batch_i, grads)
                    #print("Training agent {}".format(i))
                    #actor.update_target()
                    #critic.update_target()

                action_dims_done = action_dims_done + actor.action_dim

            # Only DDPG agent

            for i in range(ave_n, env.n):
                actor = actors[i]
                critic = critics[i]
                if replayMemory.size() > int(args["minibatch_size"]):
                    s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                        int(args["minibatch_size"]))

                    # action for critic
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    action = np.asarray(actor.predict_target(s_batch_i))
                    #print("action", action.shape)

                    # a_temp = np.transpose(np.asarray(a),(1,0,2))
                    # a_for_critic = np.asarray([x.flatten() for x in a_temp])
                    # for j in range(env.n):
                    #    print(np.asarray([x for x in s_batch[:,j]]).shape)

                    action_for_critic = np.asarray(
                        [x.flatten() for x in action])

                    s2_batch_i = np.asarray([x for x in s2_batch[:, i]])

                    # critic.predict_target(next state batch, actor_target(next state batch))
                    targetQ = critic.predict_target(s2_batch_i,
                                                    action_for_critic)

                    #print("length: ", len(targetQ))
                    #print(targetQ)

                    #time.sleep(10)
                    # loss = meanSquare(y - Critic(batch state, batch action)
                    # y = batch_r + gamma * targetQ
                    y_i = []
                    for k in range(int(args['minibatch_size'])):
                        # If ep is end
                        if d_batch[:, i][k]:
                            y_i.append(r_batch[:, i][k])
                        else:
                            y_i.append(r_batch[:, i][k] +
                                       critic.gamma * targetQ[k])
                    # state batch for agent i
                    s_batch_i = np.asarray([x for x in s_batch[:, i]])

                    loss = critic.train(
                        s_batch_i,
                        np.asarray([x.flatten() for x in a_batch[:, i]]),
                        np.asarray(y_i))

                    losses.append(loss)
                    # callback.set_model(critic.mainModel)

                    # write_log(callback, train_names, logs, ep)

                    action_for_critic_pred = actor.predict(s2_batch_i)

                    gradients = critic.action_gradients(
                        s_batch_i, action_for_critic_pred)[:, :]

                    # check gradients (disabled)
                    # grad_check = tf.check_numerics(gradients, "something wrong with gradients")
                    # with tf.control_dependencies([grad_check]):
                    #     actor.train(s_batch_i, gradients)

                    actor.train(s_batch_i, gradients)

                    # actor.update_target()

                    # critic.update_target()

            for i in range(0, env.n):
                actor = actors[i]
                critic = critics[i]
                actor.update_target()
                critic.update_target()

            episode_reward += r
            #print(done)
            if stp == int(args["max_episode_len"]) - 1 or np.all(done):
                """
				ave_reward = 0.0
				good_reward = 0.0
				for i in range(env.n):
					if i < ave_n - 1:
						ave_reward += episode_reward[i]
					else:
						good_reward += episode_reward[i]
				"""
                # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: episode_reward, summary_vars[1]: episode_av_max_q/float(stp)})
                # summary_str = sess.run(summary_ops, feed_dict = {summary_vars[0]: ave_reward, summary_vars[1]: good_reward})
                #summary_str = sess.run(summary_ops, feed_dict = {summary_vars[i]: losses[i] for i in range(len(losses))})
                #writer.add_summary(summary_str, ep)
                #writer.flush()
                # print ('|Reward: {:d}| Episode: {:d}| Qmax: {:.4f}'.format(int(episode_reward),ep,(episode_av_max_q/float(stp))))
                showReward(episode_reward, env.n, ep, start)
                break

            #if stp == int(args['max_episode_len'])-1:
            #showReward(episode_reward, env.n, ep)

        # save model
        if ep % 50 == 0 and ep != 0:
            print("Starting saving model weights every 50 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, args["modelFolder"])
            print("Model weights saved")

        if ep % 200 == 0 and ep != 0:
            directory = args["modelFolder"] + "ep" + str(ep) + "/"
            if not os.path.exists(directory):
                os.makedirs(directory)
            print("Starting saving model weights to folder every 200 episodes")
            for i in range(env.n):
                # saveModel(actors[i], i, args["modelFolder"])
                saveWeights(actors[i], i, directory)
            print("Model weights saved to folder")
Example #12
0
def train(sess, env, actor, critic, actor_noise):
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./testsummaries/', sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()
    # Initialize replay memory
    replay_buffer = ReplayMemory(10000)

    episodes = 200000
    for i in range(episodes):
        s = env.reset()

        ep_reward = 0
        for j in range(1):

            # Add exploration noise (the actor_noise() term below is currently commented out)
            #a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            #            print('state ', s)
            #            print('prediction ', actor.predict(s.reshape((1,5,1))))
            a = actor.predict(s.reshape((1, 5, 1)))  #+ actor_noise()
            #            print('action ', a)
            #            print('noise', actor_noise())
            s2, r, terminal = env.step(a)

            replay_buffer.add(s, a.reshape((env.players, 1)), r, s2, terminal)

            # Keep adding experience to the memory until
            # there are at least minibatch size samples
            #            print('buffer_size', replay_buffer.size())
            if replay_buffer.count() > 32:
                batch = replay_buffer.getBatch(32)

                states = np.asarray([seq[0] for seq in batch])
                actions = np.asarray([seq[1] for seq in batch])
                rewards = np.asarray([seq[2] for seq in batch])
                new_states = np.asarray([seq[3] for seq in batch])
                dones = np.asarray([seq[4] for seq in batch])
                y_t = rewards.copy()

                #Compute the target values
                target_q_values = critic.target_model.predict(
                    [new_states,
                     actor.target_model.predict(new_states)])
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        gamma = 0.98
                        # print(rewards[k].shape, target_q_values[k].shape, y_t[k].shape)
                        y_t[k] = rewards[k] + gamma * target_q_values[k]
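                # Bellman target y = r + gamma * Q'(s', mu'(s')). A vectorized sketch,
                # assuming dones is a 0/1 (or boolean) array broadcastable against
                # target_q_values:
                #   y_t = rewards + gamma * target_q_values * (1 - dones)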

                if True:
                    # self.loss += self.critic.model.train_on_batch([states, actions], y_t)
                    actions_for_grad = actor.model.predict(states)
                    grads = critic.gradients(states, actions_for_grad)
                    #                    print('shapes' ,states.shape, actions.shape, y_t.shape )
                    critic.train(states, actions, y_t)
                    actor.train(states, grads)
                    actor.update_target_network()
                    critic.update_target_network()
                    """
                states = [np.expand_dims(seq[0], axis=0) for seq in batch]
                actions = [np.expand_dims(seq[1], axis=0) for seq in batch]
                rewards = [seq[2] for seq in batch]
                new_states = [np.expand_dims(seq[3], axis=0) for seq in batch]
                dones = [seq[4] for seq in batch]




                target_q_values = critic.predict_target_separate(new_states, actor.predict_target_separate(new_states))
                y_t = deepcopy(target_q_values)
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k].reshape((rewards[k].shape[0], 1))
                    else:
                        gamma = 0.98
                        y_t[k] = rewards[k].reshape((rewards[k].shape[0], 1)) + gamma * target_q_values[k]


                actions_for_grads = actor.predict_separate(states)
                grads = critic.gradients_separate(states, actions_for_grads)


                actor.train_separate(states, grads)
                critic.train_separate(states, actions, y_t)

                actor.update_target_network()
                critic.update_target_network()

"""

            ep_reward += r

            #            if terminal:
            #                summary_str = sess.run(summary_ops, feed_dict={
            #                    summary_vars[0]: np.sum(ep_reward),
            #                })
            #                writer.add_summary(summary_str, i)
            #                writer.flush()
            #                break
            if i % 100 == 0:
                print("episode {} reward: {}".format(i, np.sum(ep_reward)))
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: np.sum(ep_reward),
                                       })
                writer.add_summary(summary_str, i)
                writer.flush()
                break
Example #13
0
class Actor:
    def __init__(self, path, model_path, target_model_path, actor_index):
        self.path = path
        self.model_path = model_path
        self.target_model_path = target_model_path
        self.actor_index = actor_index
        self.lr = 1e-3
        self.gamma = 0.95
        self.epsilon = 0.3
        self.batch_size = 32
        self.initial_exploration = 500
        self.N_STEP = 3
        self.step_reward = 0
        self.qf = DuelingQFunc()
        self.target_qf = DuelingQFunc()
        # model.state_dict(): fetches the model's learned parameters
        self.target_qf.load_state_dict(self.qf.state_dict())

        self.optimizer = optim.Adam(self.qf.parameters(), lr=self.lr)
        self.criterion = nn.MSELoss()
        self.env = gym.make('CartPole-v0')
        self.obs_size = self.env.observation_space.shape[0]
        self.action_size = self.env.action_space.n
        self.obs_queue = queue.Queue()
        self.reward_queue = queue.Queue()
        self.action_queue = queue.Queue()
        self.total_step = 0
        self.ten_step = 0
        self.temporal_memory = ReplayMemory()

    def run(self):
        for episode in range(1000):
            done = False
            obs = self.env.reset()
            sum_reward = 0
            step = 0
            self.step_reward = 0
            self.obs_queue = queue.Queue()
            self.reward_queue = queue.Queue()
            self.action_queue = queue.Queue()

            while not done:
                if random.random() < self.epsilon:
                    action = self.env.action_space.sample()
                else:
                    action = self.qf.select_action(obs)
                self.epsilon -= 1e-4
                if self.epsilon < 0:
                    self.epsilon = 0

                next_obs, reward, done, _ = self.env.step(action)
                terminal = 0
                reward = 0
                if done:
                    terminal = 1
                    if step < 195:
                        reward = -1
                sum_reward += reward

                self.obs_queue.put(obs)
                self.reward_queue.put(reward)
                self.action_queue.put(action)
                self.step_reward = self.step_reward / self.gamma + reward * (
                    self.gamma**self.N_STEP)
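                # step_reward presumably maintains a sliding-window N-step return
                # R_t = r_t + gamma*r_{t+1} + ... + gamma^(N-1)*r_{t+N-1}: each step the
                # window is shifted by dividing by gamma, the newest reward enters at weight
                # gamma**N_STEP, and the oldest reward is popped off below via reward_queue.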
                if step >= self.N_STEP - 1:
                    with torch.no_grad():
                        max_next_q_value_index = self.qf(
                            torch.Tensor([next_obs])).max(
                                dim=1, keepdim=True)[1].numpy().squeeze()
                        max_next_q_value = self.target_qf(
                            torch.Tensor([
                                next_obs
                            ]))[0][max_next_q_value_index].numpy()
                        current_state = self.obs_queue.get()
                        current_action = self.action_queue.get()
                        q_value = self.qf(torch.Tensor(
                            [current_state]))[0][current_action].numpy()
                        td_error = abs(self.step_reward + max_next_q_value *
                                       (self.gamma**self.N_STEP) - q_value)
                        priority = td_error
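                        # The initial priority is the absolute N-step TD error
                        # |R^(N) + gamma^N * Q_target(s', argmax_a Q(s', a)) - Q(s, a)|,
                        # a Double-DQN style target, so surprising transitions are presumably
                        # sampled more often by the prioritized replay on the learner side.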
                        self.temporal_memory.add(current_state, current_action,
                                                 self.step_reward, next_obs,
                                                 priority, terminal)
                        self.step_reward -= self.reward_queue.get()
                if done:
                    while not self.action_queue.empty():
                        with torch.no_grad():
                            self.step_reward = self.step_reward / self.gamma
                            max_next_q_value_index = self.qf(
                                torch.Tensor([next_obs])).max(
                                    dim=1, keepdim=True)[1].numpy().squeeze()
                            max_next_q_value = self.target_qf(
                                torch.Tensor([
                                    next_obs
                                ]))[0][max_next_q_value_index].numpy()
                            current_state = self.obs_queue.get()
                            current_action = self.action_queue.get()
                            q_value = self.qf(torch.Tensor(
                                [current_state]))[0][current_action].numpy()
                            td_error = abs(self.step_reward +
                                           max_next_q_value *
                                           (self.gamma**self.N_STEP) - q_value)
                            priority = td_error
                            self.temporal_memory.add(current_state,
                                                     current_action,
                                                     self.step_reward,
                                                     next_obs, priority,
                                                     terminal)
                            self.step_reward -= self.reward_queue.get()
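                    # Every 50 total steps the actor flushes its local temporal_memory into a
                    # single file shared with the learner process (an Ape-X style hand-off,
                    # presumably); the loop below retries if another process holds the file.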
                    while self.total_step % 50 == 0:
                        try:
                            if os.path.isfile(self.path):
                                # load the shared replay-memory file
                                trans_memory = torch.load(self.path)
                                # delete the memory file
                                os.remove(self.path)
                                # append this actor's local transitions to the memory
                                # vstack stacks arrays row-wise (http://ailaby.com/vstack_hstack/)
                                # vstack = concatenate(axis=0)
                                # hstack = concatenate(axis=1)
                                temporal_memory_size = self.temporal_memory.get_memory_size(
                                )
                                trans_memory['obs'] = np.vstack(
                                    (trans_memory['obs'], self.temporal_memory.
                                     obs[:temporal_memory_size]))
                                trans_memory['action'] = np.vstack(
                                    (trans_memory['action'],
                                     self.temporal_memory.
                                     actions[:temporal_memory_size]))
                                trans_memory['reward'] = np.vstack(
                                    (trans_memory['reward'],
                                     self.temporal_memory.
                                     rewards[:temporal_memory_size]))
                                trans_memory['next_obs'] = np.vstack(
                                    (trans_memory['next_obs'],
                                     self.temporal_memory.
                                     next_obs[:temporal_memory_size]))
                                trans_memory['priority'] = np.hstack(
                                    (trans_memory['priority'],
                                     self.temporal_memory.
                                     priorities[:temporal_memory_size]))
                                trans_memory['terminate'] = np.vstack(
                                    (trans_memory['terminate'],
                                     self.temporal_memory.
                                     terminates[:temporal_memory_size]))
                                # save the merged memory back to the shared file
                                torch.save(trans_memory, self.path)
                                self.temporal_memory = ReplayMemory()
                                break
                            else:
                                trans_memory = dict()
                                temporal_memory_size = self.temporal_memory.get_memory_size(
                                )
                                trans_memory[
                                    'obs'] = self.temporal_memory.obs[:
                                                                      temporal_memory_size]
                                trans_memory[
                                    'action'] = self.temporal_memory.actions[:
                                                                             temporal_memory_size]
                                trans_memory[
                                    'reward'] = self.temporal_memory.rewards[:
                                                                             temporal_memory_size]
                                trans_memory[
                                    'next_obs'] = self.temporal_memory.next_obs[:
                                                                                temporal_memory_size]
                                trans_memory[
                                    'priority'] = self.temporal_memory.priorities[:
                                                                                  temporal_memory_size]
                                trans_memory[
                                    'terminate'] = self.temporal_memory.terminates[:
                                                                                   temporal_memory_size]
                                torch.save(trans_memory, self.path)
                                self.temporal_memory = ReplayMemory()
                                break
                        except:
                            # if another process has the file open, wait a bit and retry
                            sleep(np.random.random() * 2 + 2)
                obs = next_obs.copy()

                step += 1
                self.total_step += 1
                if self.total_step < self.initial_exploration:
                    continue

                if self.total_step % 50 == 0:
                    # pull the latest network weights published by the Learner
                    while True:
                        if os.path.isfile(self.model_path):
                            try:
                                self.qf.load_state_dict(
                                    torch.load(self.model_path))
                                self.target_qf.load_state_dict(
                                    torch.load(self.target_model_path))
                                break
                            except (FileNotFoundError, EOFError, RuntimeError):
                                sleep(np.random.random() * 2 + 2)

            self.ten_step += step
            if episode % 10 == 0:
                print('ID:', self.actor_index, ' episode:', episode, 'return:',
                      self.ten_step / 10.0, 'epsilon:', self.epsilon)
                self.ten_step = 0
Example #14
0
class DriverAgent:
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Tensorflow Session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & Critic Network
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE,
                                  TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim,
                                    BATCH_SIZE, TAU, LRA)

        # Replay Memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # loading networks. modify as you want
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)

    # Train code
    def train(self, state, action, reward, next_state, done):
        # Add information to the replay memory
        if (not (math.isnan(reward))):
            self.memory.add(state, action, reward, next_state, done)

        if self.memory.count() <= START_REPLAY:
            return

        # Get batch from the replay memory
        batch = self.memory.getBatch(BATCH_SIZE)
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        rewards = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])

        # Get target Q value of the critic network
        target_Q = self.critic.target_predict(
            [new_states, self.actor.target_predict(new_states)])

        # Compute the Bellman targets y_t
        y_t = []
        for i in range(len(batch)):
            if dones[i]:
                y_t.append(rewards[i])
            else:
                y_t.append(rewards[i] + GAMMA * target_Q[i])
        y_t = np.resize(y_t, [BATCH_SIZE, 1])

        # Calculate loss value and gradient for each network, and train both
        _, loss = self.critic.train([states, actions], y_t)

        a_for_grad = self.actor.predict(states)
        grads = self.critic.gradients(states, a_for_grad)

        self.actor.train(states, grads)

        self.actor.target_train()
        self.critic.target_train()

    # save your own network
    def saveNetwork(self, episode):
        if not os.path.exists(ckp_dir):
            os.mkdir(ckp_dir)
        ckp_name_real = ckp_name + '_' + str(episode)
        self.saver.save(self.sess, os.path.join(ckp_dir, ckp_name_real))
        pass

    def action(self, state):
        # Return the policy action for the given state (no exploration noise).
        action = np.zeros([self.action_dim])
        action_pre = self.actor.predict([state])

        # ACTION: without noise
        action[0] = np.clip(action_pre[0][0], -1, 1)
        action[1] = np.clip(action_pre[0][1], 0, 1)
        action[2] = np.clip(action_pre[0][2], 0, 1)

        return action

    def noise_action(self, state, epsilon):
        # return an action according to the current policy and exploration noise
        action = np.zeros([self.action_dim])
        noise = np.zeros([self.action_dim])

        action_pre = self.actor.predict([state])

        noise[0] = epsilon * OU.function(action_pre[0][0], 0.0, 0.80, 0.60)
        noise[1] = epsilon * OU.function(action_pre[0][1], 0.7, 1.00, 0.10)
        noise[2] = epsilon * OU.function(action_pre[0][2], -0.1, 1.00, 0.05)
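        # OU.function presumably implements an Ornstein-Uhlenbeck style noise term with
        # arguments (x, mu, theta, sigma); scaling by epsilon lets the exploration noise
        # be annealed over training. The mu values (0.0 steer, 0.7 throttle, -0.1 brake)
        # bias exploration toward driving forward.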

        # ACTION: with noise
        action[0] = np.clip(action_pre[0][0] + noise[0], -1, 1)
        action[1] = np.clip(action_pre[0][1] + noise[1], 0, 1)
        action[2] = np.clip(action_pre[0][2] + noise[2], 0, 1)

        return action
Example #15
0
def train(sess, env, args, actor, critic, actor_noise):

    summary_ops, summary_vars = build_summaries()
    init = tf.global_variables_initializer()
    sess.run(init)
    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    actor.update_target()
    critic.update_target()

    replayMemory = ReplayMemory(int(args['buffer_size']),
                                int(args['random_seed']))

    for i in range(int(args['max_episodes'])):

        s = env.reset()
        episode_reward = 0
        episode_av_max_q = 0
        #if i%50==0:
        #actor.mainModel.save('results/mountainCar'+str(i)+'.h5')
        #print("Saving Model now")

        for j in range(int(args['max_episode_len'])):
            if args['render_env']:
                env.render()

            a = actor.act(np.reshape(s, (-1, actor.state_dim)), actor_noise())
            s2, r, done, _ = env.step(a[0])
            replayMemory.add(np.reshape(s, (actor.state_dim, )),
                             np.reshape(a, (actor.action_dim, )), r, done,
                             np.reshape(s2, (actor.state_dim, )))

            if replayMemory.size() > int(args['minibatch_size']):
                s_batch, a_batch, r_batch, d_batch, s2_batch = replayMemory.miniBatch(
                    int(args['minibatch_size']))
                targetQ = critic.predict_target(s2_batch,
                                                actor.predict_target(s2_batch))
                yi = []
                for k in range(int(args['minibatch_size'])):
                    if d_batch[k]:
                        yi.append(r_batch[k])
                    else:
                        yi.append(r_batch[k] + critic.gamma * targetQ[k])
                critic.train(s_batch, a_batch,
                             np.reshape(yi, (int(args['minibatch_size']), 1)))

                actions_pred = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, actions_pred)
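                # Deterministic policy gradient step: the critic's gradient dQ/da, evaluated
                # at the actor's own predicted actions, is pushed back through the actor so
                # that grad_theta J ~= E[ grad_a Q(s, a)|a=mu(s) * grad_theta mu(s) ].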
                actor.train(s_batch, grads)
                actor.update_target()
                critic.update_target()

            s = s2
            episode_reward += r
            if done:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]:
                                           episode_reward,
                                           summary_vars[1]:
                                           episode_av_max_q / float(j)
                                       })
                writer.add_summary(summary_str, i)
                writer.flush()
                print('|Reward: {:d}| Episode: {:d}'.format(
                    int(episode_reward), i))
                break
Example #16
0
            action = qf.select_action(obs)
        epsilon -= 1e-4
        if epsilon < 0:
            epsilon = 0

        next_obs, reward, done, _ = env.step(action)

        terminal = 0
        reward = 0
        if done:
            terminal = 1
            if step < 195:
                reward = -1
        sum_reward += reward

        memory.add(obs, action, reward, next_obs, terminal)
        obs = next_obs.copy()

        step += 1
        total_step += 1
        if total_step < initial_exploration:
            continue

        batch = memory.sample()

        # fetch the Q-value of the taken action for every sample in the batch
        q_value = qf(batch['obs']).gather(1, batch['actions'])

        # process all samples in the batch at once
        with torch.no_grad():
            # take the argmax action index from the online Q-network
Example #17
0
    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 0

    while True:
        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b = replayMemory.miniAll()

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            state_pre_value = reward_b + c_pre * 0.6
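            # One-step bootstrapped value target for the critic: V_target = r + gamma * V(s'),
            # with the discount factor hard-coded to 0.6 here; note that done_b is not used
            # to mask the bootstrap at episode boundaries.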
Example #18
0
def train():

    env = gym.make('LunarLander-v2')

    state = env.reset()

    actor = Actor(env.action_space, env.observation_space)

    critic = Critic(env.action_space, env.observation_space)

    actor.load()
    critic.load()

    replayMemory = ReplayMemory()

    summary_ops, summary_vars = build_summaries()

    writer = tf.summary.FileWriter("./log", tf.Session().graph)

    episode_reward = 0

    step = 1

    while True:

        #env.render()

        state1 = state[np.newaxis, :]

        action, action_matrix, prob = actor.predict(state1)

        next_state, reward, done, info = env.step(action)

        replayMemory.add(state, action_matrix, reward, done, next_state, prob)

        state = next_state

        episode_reward += reward

        #train
        if replayMemory.size() % 128 == 0 or done:

            state_b, action_matrix_b, reward_b, done_b, next_state_b, prob_b = replayMemory.miniAll(
            )

            reward_b = reward_b[:, np.newaxis]

            c_pre = critic.predict(next_state_b)

            state_pre_value = reward_b + c_pre * 0.7

            state_value = critic.predict(state_b)
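            # state_value is the critic's baseline V(s); the actor.train call below presumably
            # combines it with state_pre_value to form an advantage-like signal
            # (state_pre_value - state_value), with prob_b carried along for a ratio-based
            # (importance-weighted) policy update. This is an assumption; actor.train's
            # internals are not shown here.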

            count = 5000 // step

            if count > 500:
                count = 500

            if count < 1:
                count = 1

            count = 10

            for _ in range(count):
                critic.train(state_b, state_pre_value)

            for _ in range(count):
                actor.train(state_b, state_value, state_pre_value,
                            action_matrix_b, prob_b)

            replayMemory.clear()
        ########################

        if done:

            summary_str = tf.Session().run(
                summary_ops, feed_dict={summary_vars[0]: episode_reward})
            writer.add_summary(summary_str, step)
            writer.flush()

            ##print("step = ", step, "episode_reward = ", episode_reward)

            state = env.reset()

            episode_reward = 0

            step += 1

            if step % 25 == 0:
                actor.save()
                critic.save()