Example No. 1
def run_experiment():
    with mlflow.start_run():
        # num_iterations = 1000
        mlflow.set_tag("agent_type", "dqn")
        mlflow.log_param("num_act_units", fc_layer_params)
        mlflow.log_param("num_iterations", num_iterations)
        mlflow.log_param("initial_collect_steps", initial_collect_steps)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.set_tag("data_set", "initial_dataset_after_pca")
        mlflow.log_param("discount", discount)
        mlflow.log_param("run", 1)

        agent.train = common.function(agent.train)

        agent.train_step_counter.assign(0)

        best_score = 0
        for iteration in range(num_iterations):
            collect_data(train_env, agent.collect_policy, replay_buffer,
                         collect_steps_per_iteration)

            experience, unused_info = next(iterator)

            train_loss = agent.train(experience).loss

            step = agent.train_step_counter.numpy()

            if (step - 1) % log_interval == 0:
                print("step: ", step)
                mlflow.log_metric("loss", train_loss.numpy())

            if iteration % eval_interval == 0:
                t = time.localtime()
                current_time = time.strftime("%H:%M:%S", t)
                print("\n")
                print(iteration, current_time)
                t_eval, u_eval, ratio_of_ones_eval = calculate_u_metric(
                    eval_df)
                print("\n")
                t_train, u_train, ratio_of_ones_train = calculate_u_metric(
                    train)

                mlflow.log_metrics({
                    "t_eval": t_eval,
                    "u_eval": u_eval,
                    "t_train": t_train,
                    "u_train": u_train,
                    "ratio_of_ones_eval": ratio_of_ones_eval,
                    "ratio_of_ones_train": ratio_of_ones_train
                })
                if u_eval > best_score:
                    best_score = u_eval
                    saver = PolicySaver(agent.policy, batch_size=None)
                    saver.save("dqn_policy")

        subprocess.run(["zip", "-r", "dqn_policy.zip", "dqn_policy"])
        mlflow.log_artifact("dqn_policy.zip")
Example No. 2
  def save_policy(self, step):
    """Save strong policy with tf-agent PolicySaver."""
    print('Saving agent policy.')

    # saving environment params as metadata in order to reconstruct environment
    metadata = py_to_tf(self.env_params)
    saver = PolicySaver(self.agent.policy,
                        train_step=self.agent.train_step_counter,
                        metadata=metadata,
                        batch_size=None)
    dir_name = f'{self.uid.numpy().decode()}-{step}'
    filepath = os.path.join(configs.POLICY_DIR, dir_name)
    saver.save(filepath)
    print('Policy saved.')
Example No. 3
def main():
    all_returns = []

    for _ in range(NUM_RUNS):
        agent, train_env, evaluation_env, experience_replay = init()

        returns = training_loop(agent, train_env, evaluation_env,
                                experience_replay)

        all_returns.append(returns)

    # save policy
    PolicySaver(agent.policy).save('policy_saved')

    steps_axis = list(range(0, NUMBER_ITERATION + 1, EVAL_INTERVAL))
    plt.figure()
    for i in range(NUM_RUNS):
        plt.plot(steps_axis, all_returns[i])
    plt.xlabel('Time steps')
    plt.ylabel('Average return')
    plt.title("Rewards over 5 runs")

    for i in range(NUM_RUNS):
        plt.figure()
        plt.plot(steps_axis, all_returns[i])
        plt.title("Rewards overall")
        plt.xlabel('Time steps')
        plt.ylabel('Average return')
    plt.show()
Example No. 4
def train_tf_agent(
    model: typing.Union[TFAgent, typing.Type[TFAgent]],
    env: gym.Env,
    total_timesteps: int,
    model_name: typing.Optional[str] = None,
    maximum_episode_reward: int = 200,
    stop_training_threshold: int = 195,
):
    train_env = environment_converter.gym_to_tf(env)
    environment_name = env.__class__.__name__
    model_dir = f"{kindo.paths.save_path}/{environment_name}/{model_name}"
    Path(model_dir).mkdir(parents=True, exist_ok=True)

    stop_training_callback = callbacks.StopTrainingWhenMean100EpReward(
        reward_threshold=stop_training_threshold)
    history_saving_callback = callbacks.HistorySavingCallback(
        total_timesteps=total_timesteps,
        history_save_dir=model_dir,
        maximum_episode_reward=maximum_episode_reward,
        stop_callback=stop_training_callback,
    )

    if isinstance(model, ABCMeta):
        model = initialize_tf_agent(model_class=model, train_env=train_env)

    if model.__class__ in [
            agents.DqnAgent,
            DdqnAgent,
            agents.DdpgAgent,
            agents.SacAgent,
    ]:
        train_off_policy_tf_agent(model, train_env, total_timesteps,
                                  history_saving_callback)
    elif model.__class__ in [
            agents.PPOAgent, agents.ReinforceAgent, agents.Td3Agent
    ]:
        train_on_policy_tf_agent(model, train_env, total_timesteps,
                                 history_saving_callback)
    else:
        raise WrongModelError(
            f"Model of class `{model.__class__.__name__}` is not supported by Kindo API"
        )

    collect_policy = model.collect_policy
    saver = PolicySaver(collect_policy, batch_size=None)
    saver.save(f"{model_dir}/model")
Example No. 5
def train_agent(n_iterations):
    saver = PolicySaver(agent.policy, batch_size=tf_env.batch_size)
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in tqdm(range(n_iterations)):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        if iteration % 1000 == 0:
            print("\r{} loss:{:.5f}".format(iteration,
                                            train_loss.loss.numpy()),
                  end="")
            log_metrics(train_metrics)
        # save the policy every 100,000 iterations
        if iteration % 100000 == 0:
            saver.save('policy_%d' % iteration)
Example No. 6
def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(initial_policy, n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f} done:{:.5f}".format(
            iteration, train_loss.loss.numpy(),
            iteration / n_iterations * 100.0),
              end="")
        if iteration % 1000 == 0:
            log_metrics(train_metrics)
        if iteration % 10000 == 0 and iteration > 0:
            #keras.saved_model.saved_model(my_policy, 'policy_' + str(iteration))
            #tf.saved_model.save(agent, 'policy_' + str(iteration))
            my_policy = agent.policy
            saver = PolicySaver(my_policy)
            saver.save('policy_' + str(iteration))
Example No. 7
def main():
    agent, train_env, evaluation_env, experience_replay = init()

    returns = training_loop(agent, train_env, evaluation_env,
                            experience_replay)

    # save policy
    PolicySaver(agent.policy).save('policy_saved')

    plt.plot(returns)
    plt.title("Rewards overall")
    plt.show()
Example No. 8
def main():
    agent, train_env, evaluation_env, experience_replay = init()

    returns = training_loop(agent, train_env, evaluation_env,
                            experience_replay)

    # save policy
    PolicySaver(agent.policy).save('policy_saved')

    plt.plot(list(range(0, NUMBER_ITERATION + 1, EVAL_INTERVAL)), returns)
    plt.title("Rewards overall")
    plt.show()
Example No. 9
                add_metrics(training_info, train_metrics)

    train_agent(n_iterations=n_iterations)

    # c) For storing frames
    def get_vid_frames(policy, filename, num_episodes=20, fps=2):
        frames = []
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            frames.append(np.abs(env.get_board()) * 100)
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                frames.append(np.abs(env.get_board()) * 100)
        return frames

    # Store Data
    df = pd.DataFrame(np.array(training_info).T,
                      columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])
    df.to_csv('../DATA/stats_{}.txt'.format(II), index=False, mode="a")

    # Store Frames
    frames = get_vid_frames(agent.policy, "trained-agent")
    with open('../DATA/frames_{}.pkl'.format(II), 'wb') as f:
        pickle.dump(frames, f)

    # Store Model
    my_policy = agent.policy
    saver = PolicySaver(my_policy, batch_size=None)
    saver.save('../DATA/policy_{}'.format(II))
Example No. 10
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()

    # Print loss every 200 steps.
    if step % 200 == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    # Evaluate agent's performance every 1000 steps.
    if step % 1000 == 0:
        avg_return = compute_avg_return(env, agent.policy, 5)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns.append(avg_return)

pd.DataFrame(returns).plot()
print(returns)
plt.show()

for _ in range(10):
    time_step = env.reset()
    while not time_step.is_last().numpy()[0]:
        print(time_step.observation.numpy())
        action_step = agent.policy.action(time_step)
        time_step = env.step(action_step.action)

    print("end")

PolicySaver(agent.policy).save('temp')
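Example No. 11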
def train_eval_doom_simple(
		# Params for collect
		num_environment_steps=100000,
		collect_episodes_per_iteration=32,
		num_parallel_environments=1,
		replay_buffer_capacity=301,  # Per-environment
		# Params for train
		num_epochs=25,
		learning_rate=4e-4,
		# Params for eval
		eval_interval=10,
		num_video_episodes=10,
		# Params for summaries and logging
		log_interval=10):
	"""A simple train and eval for PPO."""
	# if not os.path.exists(videos_dir):
	# 	os.makedirs(videos_dir)
	global terminate
	eval_py_env = CarlaEnv()
	tf_env = tf_py_environment.TFPyEnvironment(eval_py_env)

	actor_net, value_net = create_networks(tf_env.observation_spec(), tf_env.action_spec())

	global_step = tf.compat.v1.train.get_or_create_global_step()
	optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate, epsilon=1e-5)

	tf_agent = ppo_agent.PPOAgent(
		tf_env.time_step_spec(),
		tf_env.action_spec(),
		optimizer,
		actor_net,
		value_net,
		num_epochs=num_epochs,
		train_step_counter=global_step,
		discount_factor=0.99,
		gradient_clipping=0.5,
		entropy_regularization=1e-2,
		importance_ratio_clipping=0.2,
		use_gae=True,
		use_td_lambda_return=True
	)
	tf_agent.initialize()

	environment_steps_metric = tf_metrics.EnvironmentSteps()
	step_metrics = [
		tf_metrics.NumberOfEpisodes(),
		environment_steps_metric,
	]

	replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(tf_agent.collect_data_spec, batch_size=num_parallel_environments, max_length=replay_buffer_capacity)
	train_replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(tf_agent.collect_data_spec, batch_size=num_parallel_environments, max_length=replay_buffer_capacity)
	collect_driver = dynamic_episode_driver.DynamicEpisodeDriver(tf_env, tf_agent.collect_policy, observers=[replay_buffer.add_batch] + step_metrics, num_episodes=collect_episodes_per_iteration)

	collect_time = 0
	train_time = 0
	timed_at_step = global_step.numpy()
	
	my_policy = tf_agent.policy
	saver = PolicySaver(my_policy, batch_size=None)

	def train_step():
		trajectories = train_replay_buffer.gather_all()
		return tf_agent.train(experience=trajectories)
	
	def evaluate(policy, step_count):
		create_video(tf_env, policy, num_video_episodes, f'agent/behave/imageio_{step_count}.mp4')


	print("collecting samples initial:")
	collect_driver.run()
	train_replay_buffer = copy.deepcopy(replay_buffer)
	replay_buffer.clear()
	print(f"train size {train_replay_buffer.num_frames()} buffer size{replay_buffer.num_frames()}")

	while environment_steps_metric.result() < num_environment_steps and not terminate:
		start_time = time.time()
		print("collecting samples")
		collector_thread = threading.Thread(target=collect_driver.run)
		collector_thread.start()

		start_time = time.time()
		count = 0
		# while collector_thread.is_alive() and not terminate:
		# 	count = count + 1
		print(f"Training agent {count}")
		total_loss, _ = train_step()
		print()
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print(f'step = {global_step.numpy()}, loss = {total_loss}, env_metric = {environment_steps_metric.result()}')
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print()
		train_replay_buffer.clear()
		print("Training agent Finshed")
		print("Waiting for collecting samples thread")
		collector_thread.join()
		print("collecting samples Finished")
		collect_time += time.time() - start_time
		train_replay_buffer = copy.deepcopy(replay_buffer)
		replay_buffer.clear()
		train_time += time.time() - start_time

		global_step_val = global_step.numpy()

		print(f"global_step_val:{global_step_val} % log_interval:{log_interval} = {global_step_val % log_interval}")

		# if global_step_val % log_interval == 0:
		print()
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print(f'step = {global_step_val}, loss = {total_loss}')
		steps_per_sec = ((global_step_val - timed_at_step) / (collect_time + train_time))
		print(f'{steps_per_sec} steps/sec')
		print(f'collect_time = {collect_time}, train_time = {train_time}')
		print("'''''''''''''''''''''''''''''''''''Tensorflow logs:'''''''''''''''''''''''''''''''''''")
		print()
		timed_at_step = global_step_val
		collect_time = 0
		train_time = 0

		if global_step_val % eval_interval == 0:
			print("Evaluating!!")
			saver.save(f'agent/saved/policy_ppo_simple_{global_step_val}')
			policy = tf_agent.policy
			evaluate(policy, global_step_val)

	print("Terminated")
	policy = tf_agent.policy
	evaluate(policy, global_step_val)
Example No. 12
    if prev_lives != lives:
        tf_env.reset()
        tf_env.pyenv.envs[0].step(np.array(1))
        prev_lives = lives


watch_driver = DynamicStepDriver(
    tf_env,
    agent.policy,
    observers=[save_frames, reset_and_fire_on_life_lost,
               ShowProgress(1000)],
    num_steps=1000)
final_time_step, final_policy_state = watch_driver.run()

plot_animation(frames)

# Create updated animated gif of agent in action
image_path = os.path.join(PROJECT_ROOT_DIR, "myAgentPlays.gif")
frame_images = [PIL.Image.fromarray(frame) for frame in frames[:150]]
frame_images[0].save(image_path,
                     format='GIF',
                     append_images=frame_images[1:],
                     save_all=True,
                     duration=30,
                     loop=0)

# Save policy and model
policy_dir = os.path.join(PROJECT_ROOT_DIR, "savedPolicy")
tf_policy_saver = PolicySaver(agent.policy)
tf_policy_saver.save(policy_dir)
Example No. 13
def train_and_evaluate_ACagent(tf_agent,
                               train_env=None,
                               eval_env=None,
                               num_iterations=None,
                               batch_size=32,
                               replay_buffer_capacity=1000,
                               name='agent'):

    if train_env is None:
        raise ValueError(
            "train_env is None! Environment should be implemented")

    if eval_env is None:
        raise ValueError(
            "eval_env is None! Environment for evaluation should be implemented"
        )

    if num_iterations is None:
        raise ValueError("Number of iterations should be implemented!")

    tf_agent.initialize()

    initial_collect_steps = 1
    collect_steps_per_iteration = 1

    print('Initial collect step is', initial_collect_steps)
    print('collect steps per iteration', collect_steps_per_iteration)
    print('batch size is ', batch_size)
    print('replay buffer capacity is', replay_buffer_capacity)

    eval_policy = tf_agent.policy
    collect_policy = gaussian_policy.GaussianPolicy(tf_agent.collect_policy)

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=1,
        max_length=replay_buffer_capacity)

    initial_collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=initial_collect_steps)

    initial_collect_driver.run()

    collect_driver = dynamic_step_driver.DynamicStepDriver(
        train_env,
        collect_policy,
        observers=[replay_buffer.add_batch],
        num_steps=collect_steps_per_iteration)

    tf_agent.train = common.function(tf_agent.train)
    collect_driver.run = common.function(collect_driver.run)

    # Reset the train step
    tf_agent.train_step_counter.assign(0)

    # Evaluate the agent's policy once before training
    avg_return = compute_avg_return(eval_env, eval_policy)
    train_return = compute_avg_return(train_env, eval_policy)
    returns = [(0, avg_return)]
    losses = []
    train_returns = [train_return]

    for _ in range(num_iterations):
        # Collect a few steps using collect_policy and save to the replay buffer.
        for _ in range(collect_steps_per_iteration):
            collect_driver.run()

        dataset = replay_buffer.as_dataset(batch_size, 2)
        iterator = iter(dataset)
        # Sample a batch of data from the buffer and update the agent's network
        experience, _ = next(iterator)
        train_loss = tf_agent.train(experience)

        step = tf_agent.train_step_counter.numpy()

        log_interval = 50
        eval_interval = 50

        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, train_loss.loss))
            losses.append((step, train_loss.loss))

        if step % eval_interval == 0:
            eval_policy = tf_agent.policy
            avg_return = compute_avg_return(eval_env, eval_policy)
            train_avg_return = compute_avg_return(train_env, eval_policy)

            print('step = {0}: Average Return = {1}'.format(step, avg_return))
            returns.append((step, avg_return))
            train_returns.append(train_avg_return)

    saver = PolicySaver(tf_agent.policy, batch_size=None)
    saver.save(r'C:\Users\DELL\Desktop\Python\\' + name + "policy_%d" % step)

    steps_list = [r[0] for r in returns]
    rewards_list = [r[1] for r in returns]
    loss_steps_list = [l[0] for l in losses]
    loss_list = [l[1] for l in losses]

    return steps_list, rewards_list, name, loss_steps_list, loss_list, train_returns
Example No. 14
def train_sac_agent(
    env_factory,
    batch_size=128,
    reward_scale_factor=1.0,
    total_training_steps=1000000,
    eval_callback_rate=None,
    eval_callback=None,
    avg_return_report_rate=1000,
    initial_collect_steps=10000,
    training_iteration_collect_steps=1,
    replay_buffer_size=120000,
    num_eval_episodes=3,
    checkpoint_dir=None,
    latest_policy_dir=None,
    best_policy_dir=None,
    tensorboard_dir=None,
    latest_policy_save_rate=5000,
    checkpoint_save_rate=20000,
):
    train_env = as_tf_env(env_factory())
    eval_env = as_tf_env(env_factory())

    agent = create_sac_agent(train_env, reward_scale_factor)
    agent.train = common.function(agent.train)
    agent.train_step_counter.assign(0)

    eval_policy = greedy_policy.GreedyPolicy(agent.policy)

    replay_buffer = create_replay_buffer(agent, train_env, replay_buffer_size)

    collect_driver = create_collect_driver(
        train_env,
        agent,
        replay_buffer,
        collect_steps=training_iteration_collect_steps)
    collect_driver.run = common.function(collect_driver.run)

    initial_collect_driver = create_collect_driver(
        train_env, agent, replay_buffer, collect_steps=initial_collect_steps)
    initial_collect_driver.run()

    dataset = replay_buffer.as_dataset(num_parallel_calls=2,
                                       sample_batch_size=batch_size,
                                       num_steps=2).prefetch(1)
    dataset_iter = iter(dataset)

    if checkpoint_dir is None:
        checkpoint_dir = tempfile.mkdtemp()
    print('Checkpoints will be stored in {0}'.format(checkpoint_dir))
    train_checkpointer = common.Checkpointer(
        ckpt_dir=checkpoint_dir,
        max_to_keep=1,
        agent=agent,
        policy=agent.policy,
        replay_buffer=replay_buffer,
        global_step=agent.train_step_counter,
    )
    train_checkpointer.initialize_or_restore()

    if latest_policy_dir is None:
        latest_policy_dir = tempfile.mkdtemp()
    print('Latest policies will be stored in {0}'.format(latest_policy_dir))
    latest_policy_saver = PolicySaver(eval_policy)

    if best_policy_dir is None:
        best_policy_dir = tempfile.mkdtemp()
    print('Best policies will be stored in {0}'.format(best_policy_dir))
    best_policy_saver = PolicySaver(eval_policy)

    if tensorboard_dir is None:
        tensorboard_dir = tempfile.mkdtemp()
    print('Tensorboard logs will be stored in {0}'.format(tensorboard_dir))
    writer = tf.summary.create_file_writer(tensorboard_dir)

    with writer.as_default():
        avg_return, avg_num_steps = evaluate_policy(
            eval_env, eval_policy, num_episodes=num_eval_episodes)
        tf.summary.scalar('Average return', avg_return, step=0)
        tf.summary.scalar('Average number of steps', avg_num_steps, step=0)
        best_avg_return = avg_return

        for _ in range(total_training_steps):
            collect_driver.run()

            experience, _ = next(dataset_iter)
            agent.train(experience)
            step = agent.train_step_counter.numpy()

            if step % avg_return_report_rate == 0:
                avg_return, avg_num_steps = evaluate_policy(
                    eval_env, eval_policy, num_episodes=num_eval_episodes)
                tf.summary.scalar('Average return', avg_return, step=step)
                tf.summary.scalar('Average number of steps',
                                  avg_num_steps,
                                  step=step)

                if avg_return > best_avg_return:
                    best_avg_return = avg_return
                    best_policy_saver.save(best_policy_dir)

            if eval_callback_rate is not None and step % eval_callback_rate == 0:
                eval_callback(eval_env, eval_policy)

            if latest_policy_save_rate is not None and step % latest_policy_save_rate == 0:
                latest_policy_saver.save(latest_policy_dir)

            if checkpoint_save_rate is not None and step % checkpoint_save_rate == 0:
                train_checkpointer.save(agent.train_step_counter)

    return agent
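A usage sketch for the SAC helper above, again hedged: suite_gym and the Pendulum environment are illustrative assumptions, and env_factory is assumed to return whatever as_tf_env (not shown here) expects.

from tf_agents.environments import suite_gym

# Hypothetical call: the factory is invoked twice, once for the training
# environment and once for the evaluation environment.
agent = train_sac_agent(
    env_factory=lambda: suite_gym.load("Pendulum-v1"),
    total_training_steps=200_000,
    best_policy_dir="sac_best_policy",
    latest_policy_dir="sac_latest_policy",
)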
Example No. 15
                                      num_steps=STEPS_PER_ITER)
    # Wrap the run function in a TF graph
    random_driver.run = common.function(random_driver.run)

    # Create a checkpointer
    checkpointer = common.Checkpointer(ckpt_dir=os.path.relpath('checkpoint'),
                                       max_to_keep=1,
                                       agent=agent,
                                       policy=agent.policy,
                                       replay_buffer=replay_buffer,
                                       global_step=global_step)
    checkpointer.initialize_or_restore()
    global_step = tf.compat.v1.train.get_global_step()

    # Create a policy saver
    policy_saver = PolicySaver(agent.policy)

    # Main training loop
    time_step, policy_state = None, None
    for it in range(N_ITERATIONS):
        if COLLECT_RANDOM:
            print('Running random driver...')
            time_step, policy_state = random_driver.run(time_step, policy_state)
        print('Running agent driver...')
        time_step, policy_state = driver.run(time_step, policy_state)
        print('Training...')
        for train_it in range(BUFFER_LENGTH//BATCH_SIZE):
            experience, _ = replay_buffer.get_next(sample_batch_size=BATCH_SIZE, num_steps=2)
            agent.train(experience)
            if (train_it + 1) % 100 == 0:
                print('{0} training iterations'.format(train_it + 1))
Example No. 16
    initial_collect_policy,
    observers=[replay_buffer.add_batch,
               ShowProgress(20000)],
    num_steps=20000)

final_time_step, final_policy_state = init_driver.run()

dataset = replay_buffer.as_dataset(sample_batch_size=64,
                                   num_steps=2,
                                   num_parallel_calls=3).prefetch(3)

#collect_driver.run = function(collect_driver.run)
#agent.train = function(agent.train)

my_policy = agent.collect_policy
saver = PolicySaver(my_policy, batch_size=None)


def train_agent(n_iterations):
    time_step = None
    policy_state = agent.collect_policy.get_initial_state(tf_env.batch_size)
    iterator = iter(dataset)
    for iteration in range(initial_policy, n_iterations):
        time_step, policy_state = collect_driver.run(time_step, policy_state)
        trajectories, buffer_info = next(iterator)
        train_loss = agent.train(trajectories)
        print("\r{} loss:{:.5f} done:{:.5f}".format(
            iteration, train_loss.loss.numpy(),
            iteration / n_iterations * 100.0),
              end="")
        if iteration % 1000 == 0:
Example No. 17
    start_time = time.time()
    for _ in range(NUM_ITERATIONS):
        driver.run()

        experience, unused_info = next(iterator)
        train_loss = agent.train(experience).loss
        step = agent.train_step_counter.numpy()

        if step % 200 == 0:
            print(f'Step {step}: loss = {train_loss}')
        if step % 1000 == 0:
            avg_return = compute_avg_return(eval_env, agent.policy)
            current_time = time.time()
            elapsed_time = current_time - start_time
            print(f'Step {step}, Time: {elapsed_time} : Average Return = {avg_return}')
            PolicySaver(agent.policy).save(f'parallel_policies/step_{step}')
            returns.append(avg_return)

    # Graph results
    iterations = range(0, NUM_ITERATIONS + 1, 1000)
    plt.plot(iterations, returns)
    plt.ylabel('Average Return')
    plt.xlabel('Iterations')
    plt.show()
Example No. 18
def save_policy(tf_agent, fname='/content/reinforce_cz'):
    my_policy = tf_agent.collect_policy
    # save policy
    PolicySaver(my_policy).save(fname)
Example No. 19
class PPOTrainer:
    """
    A PPO trainer for tf-agents.  Uses PPO agent objects with TensorFlow environments to train agents to maximize
    reward in their environments.

    Arguments:
        1. ppo_agent (PPO agent): A PPO agent used for learning in the environment env.
        2. train_env (tf env): A TensorFlow environment that the agent interacts with via the neural networks.  Used for
                               creating training trajectories for the agent, and for optimizing its networks.
        3. eval_env (tf env): A TensorFlow environment that the agent interacts with via the neural networks.  Used for
                              evaluating the performance of the agent.

        4. use_tensorboard (bool): Whether or not to plot losses with tensorboard.
        5. add_training_to_video (bool): Whether or not to record videos of the agent's training and save them to disk.
    """
    def __init__(self,
                 ppo_agent,
                 train_env,
                 eval_env,
                 use_tensorboard=True,
                 add_training_to_video=True):

        # Environment attributes
        self.train_env = train_env  # Environment for training
        self.eval_env = eval_env  # Environment for testing

        # Agent attributes
        self.agent = ppo_agent  # An instance of a tf-agents agent
        self.actor_net = self.agent._actor_net
        self.value_net = self.agent._value_net
        self.eval_policy = self.agent.policy
        self.collect_policy = self.agent.collect_policy

        # Specifics of training
        self.max_buffer_size = 1000  # Collect entire memory buffer each time
        self.collect_steps_per_iteration = 1000  # Collect entire memory buffer each time
        self.epochs = 10000  # Total number of episodes
        self.total_steps = self.epochs * self.collect_steps_per_iteration
        print("Total steps: {}".format(self.total_steps))

        # Evaluation
        self.num_eval_episodes = 5  # How many episodes we evaluate each time
        self.eval_returns = []  # Keep track of evaluation performance
        self.eval_interval = 100  # Evaluate every <x> epochs
        self.max_eval_episode_steps = 1000  # Most steps we can have in an episode

        # Logging
        self.time_ext = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.log_interval = 1
        self.policy_save_dir = os.path.join(
            os.getcwd(), "logging_{}/".format(self.time_ext))
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist; creating it now".format(
                self.policy_save_dir))
            os.mkdir(self.policy_save_dir)
        self.video_train = []
        self.add_training_to_video = add_training_to_video
        self.video_eval = []

        # Tensorboard
        self.log_dir = "./tb_log_{}".format(
            self.time_ext)  # Log directory for tensorboard
        self.train_file_writer = tf.summary.create_file_writer(
            self.log_dir)  # File writer for tf
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.use_tensorboard = use_tensorboard  # Boolean for whether or not we use tensorboard for plotting

        # Create a replay buffer
        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.max_buffer_size)

        # Get train and evaluation policy savers
        self.train_saver = PolicySaver(self.collect_policy, batch_size=None)
        self.eval_saver = PolicySaver(self.eval_policy, batch_size=None)

        # Specify directories for training and evaluation policies
        self.policy_save_dir = os.path.join(os.getcwd(), "models",
                                            self.time_ext)
        self.save_interval = 500  # Save every 500 epochs
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist;"
                  " creating it now".format(self.policy_save_dir))
            os.makedirs(self.policy_save_dir, exist_ok=True)

    def make_checkpoints(self):
        """Function for creating checkpoints to save model and track progress."""
        global_step = tf.compat.v1.train.get_or_create_global_step()

        # Create a checkpoint for training
        self.train_checkpointer = common_utils.Checkpointer(
            ckpt_dir=self.policy_save_dir,
            agent=self.agent,
            global_step=global_step)

        # Create a replay buffer checkpointer
        self.rb_checkpointer = common_utils.Checkpointer(
            ckpt_dir=os.path.join(train_dir, 'replay_buffer'),
            max_to_keep=1,
            replay_buffer=self.replay_buffer)

    def collect_step(self, add_to_video=False, step=0, epoch=0):
        """
        Function for collecting a single step from the environment.  Used for adding trajectories to the replay
        buffer.  Resets on the first time step - indicating the start of a new episode.

        Arguments:
            1. add_to_video (bool): Whether or not to create a video of the training trajectories and save it to the
                                    'logging/' directory.
            2. step (int): The current step of the episode.  Important for determining whether or not the environment
                           needs to be reset and for tracking the training trajectories in tensorboard (if tensorboard
                           plotting is enabled).
            3. epoch (int): The current epoch of training.  Used for tracking the training trajectories in tensorboard
                            (if tensorboard plotting is enabled).
        """
        # Get current time step
        if step == 0:  # Reset the environment
            time_step = self.train_env.reset()
        else:  # Take the most recent time step
            time_step = self.train_env.current_time_step()

        # Take action using the collect policy
        action_step = self.collect_policy.action(time_step)

        # Compute the next time step by stepping the training environment
        next_time_step = self.train_env.step(action_step.action)

        # Create trajectory and write it to replay buffer
        traj = trajectory.from_transition(time_step, action_step,
                                          next_time_step)
        self.replay_buffer.add_batch(traj)

        # Log to tensorboard, if enabled
        if self.use_tensorboard:
            with self.train_file_writer.as_default():
                tf.summary.image(
                    "Training Trajectories, Epoch {}".format(epoch),
                    time_step.observation,
                    step=step)

        # Add observation to video, if enabled
        if add_to_video:
            # print(time_step.observation.numpy().shape)
            self.video_train.append(time_step.observation.numpy())

    def collect_episode(self, add_to_video=False, epoch=0):
        """
        Function for generating experience data for the replay buffer.  Calls collect_step() above to add trajectories
        from the environment to the replay buffer in an episodic fashion.  Trajectories from the replay buffer are then
        used for training the agent.

        Arguments:
            1. add_to_video (bool): Whether or not to create a video of the training trajectories and save it to the
                                    'logging/' directory.
            2. epoch (int): The current epoch of training.  Used for tracking the training trajectories in tensorboard
                            (if tensorboard plotting is enabled).
        """
        # Iteratively call collect_step method above to add trajectories to replay buffer
        for i in range(self.collect_steps_per_iteration):
            self.collect_step(add_to_video=add_to_video, step=i, epoch=epoch)

    def compute_avg_reward(self, epoch=None):
        """
        Function for computing the average reward over a series of evaluation episodes
        by creating simulation episodes using the agent's current policies,
        then computing rewards from taking actions using the evaluation (greedy) policy and averaging them.

        Arguments:
            1. epoch (int): The current epoch of training.  Used for tracking the training trajectories in tensorboard
                            (if tensorboard plotting is enabled).

        Returns:
            1. avg_return (float): The average return over the evaluation
               episodes used to assess the agent's current policies.
        """
        total_return = 0.0
        for _ in range(self.num_eval_episodes):
            time_step = self.eval_env.reset()

            # Set step counter - capped at self.max_eval_episode_steps
            i = 0

            # Add to value in loop
            episode_return = 0.0

            while not time_step.is_last() and i < self.max_eval_episode_steps:
                action_step = self.eval_policy.action(time_step)
                self.video_eval.append(
                    time_step.observation.numpy())  # Add to video frame
                time_step = self.eval_env.step(action_step.action)

                # Log to tensorboard
                if self.use_tensorboard:
                    with self.train_file_writer.as_default():
                        try:
                            tf.summary.image(
                                "Eval Trajectories, Epoch {}".format(epoch),
                                time_step.observation,
                                step=i)
                        except:
                            print(
                                "Please provide an input for the epoch number."
                            )

                episode_return += time_step.reward
                if i % 250 == 0:
                    print("Action: {}, Reward: {}".format(
                        action_step.action.numpy(), episode_return))
                i += 1
            print("Steps in episode: {}".format(i))
            total_return += episode_return
        avg_return = total_return / self.num_eval_episodes

        print("Average return: {}".format(avg_return))
        self.eval_returns.append(avg_return)
        return avg_return

    def train_agent(self):
        """
        Function for training a PPO tf-agent using trajectories from the replay buffer.  Does initial evaluation of the
        agent prior to training, and then iterates over epochs of the following procedure:

            a. Collect an episode of data, and write the trajectories to the replay buffer.
            b. Train from the trajectories on the replay buffer.  Updates the weights of the actor and value networks.
            c. Empty the replay buffer.
            d. (If enabled) Save data to disk for tensorboard.
            e. Depending on epoch number and the evaluation and logging intervals, evaluate the agent or log information.

        Returns:
            1. agent (PPO agent): The PPO agent trained during the training process
        """
        eval_epochs = []

        # Optimize by wrapping some of the code in a graph using TF function.
        self.agent.train = common.function(self.agent.train)
        self.agent.train_step_counter.assign(0)
        avg_return = self.compute_avg_reward(
            epoch=0)  # Compute pre-training metrics

        # Log average reward to tensorboard
        if self.use_tensorboard:
            with self.train_file_writer.as_default():
                tf.summary.scalar("Avg. Reward", float(avg_return), step=0)

        print("DONE WITH PRELIMINARY EVALUATION...")
        # Append for output plot
        eval_epochs.append(0)
        self.video_eval = []  # Empty to create a new eval video
        returns = [avg_return]

        time_step = self.train_env.reset()

        # Episode counter
        i = 0
        for i in range(self.epochs):
            print("Training epoch: {}".format(i))

            # Collect data and train agent; clear buffer at end
            print("COLLECTING EPISODE")
            # Reset the old training video
            self.video_train = []
            self.collect_episode(add_to_video=self.add_training_to_video,
                                 epoch=i)
            self.create_video(mode='train', ext=i)
            print("COLLECTED EPISODE")
            trajectories = self.replay_buffer.gather_all()

            # Old weights
            old_vnet = copy.deepcopy(
                self.agent._value_net.trainable_variables[0])
            old_anet = copy.deepcopy(
                self.agent._actor_net.trainable_variables[0])

            # Take training step
            train_loss = self.agent.train(experience=trajectories)

            # Log loss to tensorboard
            if self.use_tensorboard:
                with self.train_file_writer.as_default():
                    tf.summary.scalar("Training Loss",
                                      float(train_loss.loss),
                                      step=i)

            # Get new weights
            new_vnet = copy.deepcopy(
                self.agent._value_net.trainable_variables[0])
            new_anet = copy.deepcopy(
                self.agent._actor_net.trainable_variables[0])

            # Display Frobenius norm
            print("VALUE NET Frobenius Norm Difference: {}".format(
                tf.norm(old_vnet - new_vnet)))
            print("ACTOR NET Frobenius Norm Difference: {}".format(
                tf.norm(old_anet - new_anet)))

            # Step the counter, and log/evaluate agent
            step = self.agent.train_step_counter.numpy()

            if step % self.log_interval == 0:
                print('step = {0}: loss = {1}'.format(step, train_loss.loss))

            if (i + 1) % self.eval_interval == 0:

                avg_return = self.compute_avg_reward(epoch=i)

                # Log average reward to tensorboard
                if self.use_tensorboard:
                    with self.train_file_writer.as_default():
                        tf.summary.scalar("Avg. Reward",
                                          float(avg_return),
                                          step=i)

                eval_epochs.append(i + 1)
                print('epoch = {0}: Average Return = {1}'.format(
                    i + 1, avg_return))
                returns.append(avg_return)
                self.create_video(mode='eval', ext=i)
                self.video_eval = []  # Empty to create a new eval video

            # Save the training and eval policies every save_interval epochs
            if i % self.save_interval == 0 and i != 0:
                self.save_policy(epochs_done=i)
                print("Epochs: {}".format(i))

            self.replay_buffer.clear()

        # At the end of training, return the agent
        return self.agent

    def playback_trajectories(self, recdir=None):

        counts = [0, 0]

        def handle_ep(observations, actions, rewards):
            counts[0] += 1
            counts[1] += observations.shape[0]
            logger.debug(
                'Observations.shape={}, actions.shape={}, rewards.shape={}',
                observations.shape, actions.shape, rewards.shape)

        if recdir is None:
            print(
                "Error: Please specify a recording directory by calling gym_env.directory"
            )
        else:
            scan_recorded_traces(recdir, handle_ep)

    def create_video(self, mode='eval', ext=0):
        if mode == 'eval':
            video = self.video_eval
        elif mode == 'train':
            video = self.video_train
        # Check if video is zero length
        if len(video) == 0:
            raise AssertionError("Video is empty.")
        print("Number of frames in video: {}".format(len(video)))
        obs_size = video[0].shape
        height = np.uint(obs_size[-3])
        width = np.uint(obs_size[-2])
        channels = np.uint(obs_size[-1])
        print("HEIGHT IS: {}, WIDTH IS: {}, CHANNELS IS: {}".format(
            height, width, channels))
        fourcc = cv.VideoWriter_fourcc(*'XVID')
        out = cv.VideoWriter(
            os.path.join(self.policy_save_dir,
                         "trajectories_{}_epoch_{}.avi".format(mode, ext)),
            fourcc, self.FPS, (width, height))
        for i in range(len(video)):
            img_rgb = cv.cvtColor(np.uint8(255 * video[i][0]),
                                  cv.COLOR_BGR2RGB)  # Save as RGB image
            out.write(img_rgb)
        out.release()

    def plot_eval(self):
        xs = [i * self.eval_interval for i in range(len(self.eval_returns))]
        plt.plot(xs, self.eval_returns)
        plt.xlabel("Training epochs")
        plt.ylabel("Average Return")
        plt.title("Average Returns as a Function of Training")
        plt.savefig(os.path.join(self.policy_save_dir, "eval_returns.png"))
        print("CREATED PLOT OF RETURNS...")

    def save_policy(self, epochs_done=0):
        """
        Using the PolicySaver(s) defined in the trainer constructor, this
        function saves the training and evaluation policies according to the
        policy_save_dir attribute and whether multiple PPO agents or a single
        master PPO agent is used.

        Arguments:
            1. epochs_done (int):  The number of epochs completed in the
                                   training process at the time this save
                                   function is called.
        """

        # Save training policy
        train_save_dir = os.path.join(self.policy_save_dir, "train",
                                      "epochs_{}".format(epochs_done))
        if not os.path.exists(train_save_dir):
            os.makedirs(train_save_dir, exist_ok=True)
        self.train_saver.save(train_save_dir)

        print("Training policy saved...")

        # Save eval policy
        eval_save_dir = os.path.join(self.policy_save_dir, "eval",
                                     "epochs_{}".format(epochs_done))
        if not os.path.exists(eval_save_dir):
            os.makedirs(eval_save_dir, exist_ok=True)
        self.eval_saver.save(eval_save_dir)

        print("Eval policy saved...")

    def load_saved_policy(self, eval_model_path=None, train_model_path=None):

        # Load evaluation and/or training policies from path
        if eval_model_path is not None:
            self.eval_policy = tf.saved_model.load(eval_model_path)
            print("Loading evaluation policy from: {}".format(eval_model_path))

        if train_model_path is not None:
            self.collect_policy = tf.saved_model.load(train_model_path)
            print("Loading training policy from: {}".format(train_model_path))
Example No. 20
    def __init__(self,
                 ppo_agent,
                 train_env,
                 eval_env,
                 use_tensorboard=True,
                 add_training_to_video=True):

        # Environment attributes
        self.train_env = train_env  # Environment for training
        self.eval_env = eval_env  # Environment for testing

        # Agent attributes
        self.agent = ppo_agent  # An instance of a tf-agents agent
        self.actor_net = self.agent._actor_net
        self.value_net = self.agent._value_net
        self.eval_policy = self.agent.policy
        self.collect_policy = self.agent.collect_policy

        # Specifics of training
        self.max_buffer_size = 1000  # Collect entire memory buffer each time
        self.collect_steps_per_iteration = 1000  # Collect entire memory buffer each time
        self.epochs = 10000  # Total number of episodes
        self.total_steps = self.epochs * self.collect_steps_per_iteration
        print("Total steps: {}".format(self.total_steps))

        # Evaluation
        self.num_eval_episodes = 5  # How many episodes we evaluate each time
        self.eval_returns = []  # Keep track of evaluation performance
        self.eval_interval = 100  # Evaluate every <x> epochs
        self.max_eval_episode_steps = 1000  # Most steps we can have in an episode

        # Logging
        self.time_ext = datetime.now().strftime("%Y%m%d-%H%M%S")
        self.log_interval = 1
        self.policy_save_dir = os.path.join(
            os.getcwd(), "logging_{}/".format(self.time_ext))
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist; creating it now".format(
                self.policy_save_dir))
            os.mkdir(self.policy_save_dir)
        self.video_train = []
        self.add_training_to_video = add_training_to_video
        self.video_eval = []

        # Tensorboard
        self.log_dir = "./tb_log_{}".format(
            self.time_ext)  # Log directory for tensorboard
        self.train_file_writer = tf.summary.create_file_writer(
            self.log_dir)  # File writer for tf
        if not os.path.exists(self.log_dir):
            os.mkdir(self.log_dir)
        self.use_tensorboard = use_tensorboard  # Boolean for whether or not we use tensorboard for plotting

        # Create a replay buffer
        self.replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
            self.agent.collect_data_spec,
            batch_size=self.train_env.batch_size,
            max_length=self.max_buffer_size)

        # Get train and evaluation policy savers
        self.train_saver = PolicySaver(self.collect_policy, batch_size=None)
        self.eval_saver = PolicySaver(self.eval_policy, batch_size=None)

        # Specify directories for training and evaluation policies
        self.policy_save_dir = os.path.join(os.getcwd(), "models",
                                            self.time_ext)
        self.save_interval = 500  # Save every 500 epochs
        if not os.path.exists(self.policy_save_dir):
            print("Directory {} does not exist;"
                  " creating it now".format(self.policy_save_dir))
            os.makedirs(self.policy_save_dir, exist_ok=True)
Example No. 21
                f.close()

    train_agent(n_iterations=n_iterations)

    # c) For storing frames
    def get_vid_frames(policy, filename, num_episodes=20, fps=2):
        frames = []
        for _ in range(num_episodes):
            time_step = tf_env.reset()
            frames.append(np.abs(env.get_board()))
            while not time_step.is_last():
                action_step = policy.action(time_step)
                time_step = tf_env.step(action_step.action)
                frames.append(np.abs(env.get_board()))
        return frames

    # Store Data
    df = pd.DataFrame(np.array(training_info).T,
                      columns=['N_Ep', 'Env_Steps', 'Avf_RM', 'Avg_EPLM'])
    df.to_csv('../DATA/Single/stats_{}.txt'.format(II), index=False, mode="a")

    # Store Frames
    frames = get_vid_frames(agent.policy, "trained-agent")
    with open('../DATA/Single/frames_{}.pkl'.format(II), 'wb') as f:
        pickle.dump(frames, f)

    # Store Model
    my_policy = agent.policy
    saver = PolicySaver(my_policy, batch_size=None)
    saver.save('../DATA/Single/policy_{}'.format(II))
Example No. 22
                                                    train_env.action_spec())

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=tf_agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=config.REPLAY_BUFFER_MAX_LENGTH)

    collect_data(train_env, random_policy, replay_buffer, steps=100)

    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3, 
        sample_batch_size=config.BATCH_SIZE, 
        num_steps=2).prefetch(3)
    
    my_policy = tf_agent.collect_policy
    saver = PolicySaver(my_policy, batch_size=None)

    iterator = iter(dataset)
    tf_agent.train = common.function(tf_agent.train)

    # Reset the train step
    tf_agent.train_step_counter.assign(0)

    # Evaluate the agent's policy once before training.
    avg_return = compute_avg_return(eval_env, tf_agent.policy, \
                                    config.NUM_EVAL_EPISODES)
    returns = [avg_return]
    iterations = [0]
    for _ in tqdm(range(config.NUM_ITERATIONS), total=config.NUM_ITERATIONS):
            # Collect a few steps using collect_policy and save to the replay buffer.
            for _ in range(config.COLLECT_STEPS_PER_ITERATION):