Example #1

import csv
import sys

import numpy as np

# Task and DDPG are assumed to be provided by the surrounding project:
# a quadcopter Task environment and a DDPG agent implementation.


def train():

    runtime = 5.  # episode time limit (defined here but not passed to Task below)
    init_pose = np.array([0., 0., 4.0, 0., 0., 0.0])  # initial pose
    init_velocities = np.array([0., 0., 0.0])  # initial velocities
    init_angle_velocities = np.array([0., 0., 0.])  # initial angle velocities
    file_output = 'rewards.txt'  # file name for saved results

    num_episodes = 10
    target_pos = np.array([0., 0., 40.])
    task = Task(init_pose=init_pose,
                init_velocities=init_velocities,
                init_angle_velocities=init_angle_velocities,
                target_pos=target_pos)
    agent = DDPG(task)

    labels = ['episode', 'avg_reward', 'total_reward']
    results = {x: [] for x in labels}

    with open(file_output, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(labels)
        best_total_reward = -np.inf  # best episode return seen so far
        for i_episode in range(1, num_episodes + 1):
            state = agent.reset_episode()  # start a new episode
            total_reward = 0
            rewards = []

            while True:

                # select action according to the learned policy and the exploration noise
                action = agent.act(state)
                # execute the action and observe the reward and the next state
                next_state, reward, done = task.step(action)

                # sample mini batch and learn
                agent.step(action, reward, next_state, done)

                # data tracking
                total_reward += reward
                rewards.append(reward)

                if total_reward > best_total_reward:
                    best_total_reward = total_reward

                state = next_state

                if done:
                    avg_reward = np.mean(np.array(rewards))
                    print(task.sim.pose)

                    to_write = [i_episode, avg_reward, total_reward]
                    for ii in range(len(labels)):
                        results[labels[ii]].append(to_write[ii])
                    writer.writerow(to_write)
                    print(
                        "\rEpisode = {:4d}, total_reward = {:7.3f}, avg_reward = {:7.3f} (best = {:7.3f})"
                        .format(i_episode, total_reward, avg_reward,
                                best_total_reward),
                        end="")  # [debug]
                    break
            sys.stdout.flush()

    return agent
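
# Standalone variant of the same training loop (module level, no CSV logging),
# run for 1000 episodes with a target 10 units above the initial position.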
num_episodes = 1000
init_pose = np.array([0., 0., 0., 0., 0., 0.])  # initial pose
target_pos = np.array([0., 0., 10.])  # target position
init_velocities = np.array([0., 0., 0.])  # initial velocities
init_angle_velocities = np.array([0., 0., 0.])  # initial angle velocities

task = Task(init_pose=init_pose,
            target_pos=target_pos,
            init_angle_velocities=init_angle_velocities,
            init_velocities=init_velocities)
best_score = -np.inf

agent = DDPG(task)

# Run the training episodes, tracking the best per-episode score.
for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()  # start a new episode
    score = 0
    while True:
        action = agent.act(state)  # policy action plus exploration noise
        next_state, reward, done = task.step(action)  # advance the simulation
        agent.step(action, reward, next_state, done)  # store experience and learn
        state = next_state
        score += reward
        best_score = max(best_score, score)
        if done:
            print("\rEpisode = {:4d}, score = {:7.3f} (best = {:7.3f})".format(
                i_episode, score, best_score),
                  end="")  # [debug]
            break
    sys.stdout.flush()
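
# train() above returns the trained agent and logs per-episode stats to
# 'rewards.txt'; a minimal entry point (assumed, not part of the original
# listing) would be:
#
#     if __name__ == '__main__':
#         trained_agent = train()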