Example #1
0
def train(log_dir):
    """Performs the agent training.

    Trains a Lyapunov Actor Critic (LAC) agent in the environment selected by
    the module-level ``ENV_NAME`` setting, optionally continuing from a
    previously saved model, and logs diagnostics through the csv ``logger``.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create training and evaluation environments
    # FIX: message typo "Your training" -> "You're training".
    print(f"You're training in the {ENV_NAME} environment.\n")
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = os.path.abspath(
            os.path.join(log_dir, "../../" + retrain_model_folder))

        # Check if retrain model exists if not throw error
        if not os.path.exists(retrain_model_path):
            print(
                "Shutting down training since the model you specified in the "
                f"`continue_model_folder` `{retrain_model_folder}` "
                f"argument was not found for the `{ENV_NAME}` environment.")
            sys.exit(0)

        # Load retrain model
        print(f"Restoring model `{retrain_model_path}`")
        result = policy.restore(
            os.path.abspath(os.path.join(retrain_model_path, "policy")))
        if not result:
            # FIX: message typo "Shuting" -> "Shutting".
            print(
                "Shutting down training as something went wrong while loading "
                f"model `{retrain_model_folder}`.")
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/")) +
            "_finetune")
        log_dir = "/".join(log_dir_split)

        # Reset lagrance multipliers if requested
        if ALG_PARAMS["reset_lagrance_multipliers"]:
            policy.sess.run(
                policy.log_alpha.assign(tf.math.log(ALG_PARAMS["alpha"])))
            policy.sess.run(
                policy.log_labda.assign(tf.math.log(ALG_PARAMS["labda"])))
    else:
        print(f"Train new model `{log_dir}`")

    # Print logging folder
    print(f"Logging results to `{log_dir}`.")

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "lyapunov_error": [],
            "alpha": [],
            "lambda": [],
            "entropy": [],
            "a_loss": [],
        }

        # Break out of loop if global steps have been reached
        if global_step > ENV_PARAMS["max_global_steps"]:

            # Print step count, save model and stop the program
            print(f"Training stopped after {global_step} steps.")
            print("Running time: ", time.time() - t1)
            print("Saving Model")
            policy.save_result(log_dir)
            print("Running time: ", time.time() - t1)
            return

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                        and global_step != 0):

                    # Create intermediate result checkpoint folder
                    # FIX: name the checkpoint after the global step. The
                    # episode step `j` resets every episode while the save
                    # frequency is keyed on `global_step`, so using `j` made
                    # later checkpoints overwrite earlier ones.
                    checkpoint_save_path = os.path.abspath(
                        os.path.join(log_dir, "checkpoints",
                                     "step_" + str(global_step)))
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound -
                                                 a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch)

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(
                            test_env, policy)
                        # FIX (idiom): plain loops instead of list
                        # comprehensions used only for their side effects.
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        for key, value in eval_diagnostics.items():
                            string_to_print.extend(
                                [key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend(
                            [key, ":", str(round(value, 2)), "|"])
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step -
                              1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break
def train(log_dir):
    """Performs the agent training.

    Trains a Lyapunov Actor Critic (LAC) agent in the environment selected by
    the module-level ``ENV_NAME`` setting, optionally logging diagnostics to
    tensorboard (``USE_TB``) and to a csv file through the ``logger``.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
            log data is saved.
    """

    # Create training and evaluation environments
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    # FIX: evaluate on a dedicated environment. Previously
    # `training_evaluation` was run on the live training env in the middle of
    # an episode, which corrupted the state of the episode being trained on.
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    tb_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Initialize tensorboard variables and create summaries
    if USE_TB:

        # Create tensorboard variables
        # NOTE: only created when tb logging is enabled since they are not
        # referenced anywhere else (previously they were created
        # unconditionally, adding unused nodes to the graph).
        tb_lr_a = tf.Variable(lr_a, dtype=tf.float32)
        tb_lr_l = tf.Variable(lr_l, dtype=tf.float32)
        tb_lr_lag = tf.Variable(lr_a, dtype=tf.float32)
        tb_ret = tf.Variable(0, dtype=tf.float32)
        tb_len = tf.Variable(0, dtype=tf.float32)
        tb_a_loss = tf.Variable(0, dtype=tf.float32)
        tb_lyapunov_error = tf.Variable(0, dtype=tf.float32)
        tb_entropy = tf.Variable(0, dtype=tf.float32)
        policy.sess.run(
            [
                tb_lr_a.initializer,
                tb_lr_l.initializer,
                tb_lr_lag.initializer,
                tb_ret.initializer,
                tb_len.initializer,
                tb_a_loss.initializer,
                tb_lyapunov_error.initializer,
                tb_entropy.initializer,
            ]
        )

        # Add tensorboard summaries
        main_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("lr_a", tb_lr_a),
                tf.compat.v1.summary.scalar("lr_l", tb_lr_l),
                tf.compat.v1.summary.scalar("lr_lag", tb_lr_lag),
                tf.compat.v1.summary.scalar("alpha", policy.alpha),
                tf.compat.v1.summary.scalar("lambda", policy.labda),
            ]
        )
        other_sum = tf.compat.v1.summary.merge(
            [
                tf.compat.v1.summary.scalar("ep_ret", tb_ret),
                tf.compat.v1.summary.scalar("ep_length", tb_len),
                tf.compat.v1.summary.scalar("a_loss", tb_a_loss),
                tf.compat.v1.summary.scalar("lyapunov_error", tb_lyapunov_error),
                tf.compat.v1.summary.scalar("entropy", tb_entropy),
            ]
        )
        policy.tb_writer.add_summary(
            policy.sess.run(main_sum), policy.sess.run(policy.step)
        )
        if WRITE_W_B:
            policy.tb_writer.add_summary(
                policy.sess.run(policy.w_b_sum), policy.sess.run(policy.step),
            )
        policy.tb_writer.flush()  # Above summaries are known from the start

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if USE_TB:
                tb_step += 1

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        # FIX: evaluate on the dedicated test environment (see
                        # note at the top of the function).
                        eval_diagnostics = training_evaluation(test_env, policy)
                        # FIX (idiom): plain loops instead of list
                        # comprehensions used only for their side effects.
                        for key, value in eval_diagnostics.items():
                            logger.logkv(key, value)
                        training_diagnostics.pop("return")
                    for key, value in training_diagnostics.items():
                        logger.logkv(key, value)
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        for key, value in eval_diagnostics.items():
                            string_to_print.extend([key, ":", str(value), "|"])
                    for key, value in training_diagnostics.items():
                        string_to_print.extend(
                            [key, ":", str(round(value, 2)), "|"]
                        )
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if USE_TB:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if USE_TB:
                    if i % TB_FREQ == 0:

                        # Update and log learning rate tb vars
                        policy.sess.run(policy.step.assign(tb_step))
                        policy.sess.run(tb_lr_a.assign(lr_a_now))
                        policy.sess.run(tb_lr_l.assign(lr_l_now))
                        policy.sess.run(tb_lr_lag.assign(lr_a))
                        policy.tb_writer.add_summary(
                            policy.sess.run(main_sum), policy.sess.run(policy.step)
                        )

                        # Update and log other training vars to tensorboard
                        if training_started:

                            # Update and log training vars
                            policy.sess.run(
                                tb_ret.assign(training_diagnostics["return"])
                            )
                            policy.sess.run(
                                tb_len.assign(training_diagnostics["length"])
                            )
                            policy.sess.run(
                                tb_a_loss.assign(training_diagnostics["a_loss"])
                            )
                            policy.sess.run(
                                tb_lyapunov_error.assign(
                                    training_diagnostics["lyapunov_error"]
                                )
                            )
                            policy.sess.run(
                                tb_entropy.assign(training_diagnostics["entropy"])
                            )
                            policy.tb_writer.add_summary(
                                policy.sess.run(other_sum), policy.sess.run(policy.step)
                            )

                            # Log network weights
                            if WRITE_W_B:
                                policy.tb_writer.add_summary(
                                    policy.sess.run(policy.w_b_sum),
                                    policy.sess.run(policy.step),
                                )
                        policy.tb_writer.flush()

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
Example #3
0
def train(log_dir):
    """Performs the agent traning.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
        log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim, log_dir=log_dir)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Log initial values to tensorboard
    if DEBUG_PARAMS["use_tb"]:

        # Trace learn method (Used for debugging)
        if DEBUG_PARAMS["debug"]:
            if DEBUG_PARAMS["trace_net"]:

                # Create dummy input
                batch = {
                    "s": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                    "a": tf.random.uniform((ALG_PARAMS["batch_size"], policy.a_dim)),
                    "r": tf.random.uniform((ALG_PARAMS["batch_size"], 1)),
                    "terminal": tf.zeros((ALG_PARAMS["batch_size"], 1)),
                    "s_": tf.random.uniform((ALG_PARAMS["batch_size"], policy.s_dim)),
                }

                # Trace learn method and log to tensorboard
                tf.summary.trace_on(graph=True, profiler=True)
                policy.learn(lr_a_now, lr_l_now, lr_a, batch)
                with policy.tb_writer.as_default():
                    tf.summary.trace_export(
                        name="learn", step=0, profiler_outdir=log_dir
                    )

            # Shut down as we are in debug mode
            if DEBUG_PARAMS["trace_net"] or DEBUG_PARAMS["trace_learn"]:
                print(
                    "Shutting down training as a trace was requested in debug mode. "
                    "This was done since during the trace a backward pass was performed "
                    "on dummy data. Please disable the trace to continue training "
                    "while being in debug mode."
                )
                sys.exit(0)

        # Log initial values
        with policy.tb_writer.as_default():
            tf.summary.scalar("lr_a", lr_a_now, step=0)
            tf.summary.scalar("lr_l", lr_l_now, step=0)
            tf.summary.scalar("lr_lag", lr_a, step=0)
            tf.summary.scalar("alpha", policy.alpha, step=0)
            tf.summary.scalar("lambda", policy.labda, step=0)

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # a = np.squeeze(np.random.uniform(low=-1.0, high=1.0, size=(1, 2)))  # DEBUG
            action = a_lowerbound + (a + 1.0) * (a_upperbound - a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Increment tensorboard step counter
            # NOTE: This was done differently from the global_step counter since
            # otherwise there were inconsistencies in the tb log.
            if DEBUG_PARAMS["use_tb"]:
                policy.step += 1

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (
                pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                and global_step % ALG_PARAMS["steps_per_cycle"] == 0
            ):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch
                    )

            # Save path results
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evalute the current performance and log results
            if (
                training_started
                and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                and global_step > 0
            ):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":", str(eval_diagnostics[key]), "|"]
                            )
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend(
                            [key, ":", str(round(training_diagnostics[key], 2)), "|"]
                        )
                        for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Decay learning rate
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                    # Get current model performance for tb
                    if DEBUG_PARAMS["use_tb"]:
                        training_diagnostics = evaluate_training_rollouts(
                            last_training_paths
                        )

                # Log tb variables
                if DEBUG_PARAMS["use_tb"]:
                    if i % DEBUG_PARAMS["tb_freq"] == 0:

                        # Log learning rate to tb
                        with policy.tb_writer.as_default():
                            tf.summary.scalar("lr_a", lr_a_now, step=policy.step)
                            tf.summary.scalar("lr_l", lr_l_now, step=policy.step)
                            tf.summary.scalar("lr_lag", lr_a, step=policy.step)
                            tf.summary.scalar("alpha", policy.alpha, step=policy.step)
                            tf.summary.scalar("lambda", policy.labda, step=policy.step)

                        # Update and log other training vars to tensorboard
                        if training_started:
                            with policy.tb_writer.as_default():
                                tf.summary.scalar(
                                    "ep_ret",
                                    training_diagnostics["return"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "ep_length",
                                    training_diagnostics["length"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "a_loss",
                                    training_diagnostics["a_loss"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "lyapunov_error",
                                    training_diagnostics["lyapunov_error"],
                                    step=policy.step,
                                )
                                tf.summary.scalar(
                                    "entropy",
                                    training_diagnostics["entropy"],
                                    step=policy.step,
                                )

                            # Log network weights
                            if DEBUG_PARAMS["write_w_b"]:
                                with policy.tb_writer.as_default():

                                    # GaussianActor weights/biases
                                    tf.summary.histogram(
                                        "Ga/l1/weights",
                                        policy.ga.net_0.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/l1/bias",
                                        policy.ga.net_0.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/l2/weights",
                                        policy.ga.net_1.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/l2/bias",
                                        policy.ga.net_1.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/mu/weights",
                                        policy.ga.mu.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/mu/bias",
                                        policy.ga.mu.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/log_sigma/weights",
                                        policy.ga.log_sigma.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga/log_sigma/bias",
                                        policy.ga.log_sigma.weights[1],
                                        step=policy.step,
                                    )

                                    # Target GaussianActor weights/biases
                                    tf.summary.histogram(
                                        "Ga_/l1/weights",
                                        policy.ga_.net_0.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l1/bias",
                                        policy.ga_.net_0.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l2/weights",
                                        policy.ga_.net_1.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/l2/bias",
                                        policy.ga_.net_1.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/mu/weights",
                                        policy.ga_.mu.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/mu/bias",
                                        policy.ga_.mu.weights[1],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/log_sigma/weights",
                                        policy.ga_.log_sigma.weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Ga_/log_sigma/bias",
                                        policy.ga_.log_sigma.weights[1],
                                        step=policy.step,
                                    )

                                    # Lyapunov critic weights/biases
                                    tf.summary.histogram(
                                        "Lc/w1_s", policy.lc.w1_s, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc/w1_a", policy.lc.w1_a, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc/b1", policy.lc.b1, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc/net/l2/weights",
                                        policy.lc.net.layers[0].weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc/net/l2/bias",
                                        policy.lc.net.layers[0].weights[1],
                                        step=policy.step,
                                    )

                                    # Target Lyapunov critic weights/biases
                                    tf.summary.histogram(
                                        "Lc_/w1_s", policy.lc_.w1_s, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc_/w1_a", policy.lc_.w1_a, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc_/b1", policy.lc_.b1, step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc_/net/l2/weights",
                                        policy.lc_.net.layers[0].weights[0],
                                        step=policy.step,
                                    )
                                    tf.summary.histogram(
                                        "Lc_/net/l2/bias",
                                        policy.lc_.net.layers[0].weights[1],
                                        step=policy.step,
                                    )

                # Decay learning rates
                frac = 1.0 - (global_step - 1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
def train(variant):
    """Trains an RL agent (optionally with a Lyapunov-based safety critic) on
    the environment specified in the experiment configuration.

    Args:
        variant (dict): Experiment configuration. Expected keys include
            ``env_name``, ``env_params``, ``algorithm_name``, ``alg_params``,
            ``store_last_n_paths``, ``evaluation_frequency``,
            ``num_of_paths``, ``evaluate`` and ``log_path``.
    """
    # Create the training environment (and a separate evaluation environment
    # when evaluation is requested, so evaluation rollouts do not disturb the
    # training episode state).
    env_name = variant['env_name']
    env = get_env_from_name(env_name)
    if variant['evaluate'] is True:
        evaluation_env = get_env_from_name(env_name)
    else:
        evaluation_env = None
    env_params = variant['env_params']
    judge_safety_func = get_safety_constraint_func(variant)

    # Training limits and evaluation settings
    max_episodes = env_params['max_episodes']
    max_ep_steps = env_params['max_ep_steps']
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']
    num_of_paths = variant['num_of_paths']

    # Build the policy from the requested algorithm
    alg_name = variant['algorithm_name']
    policy_build_fn = get_policy(alg_name)
    policy_params = variant['alg_params']
    min_memory_size = policy_params['min_memory_size']
    noise_scale = policy_params['noise']
    noise_scale_now = noise_scale  # decayed linearly at the end of each episode

    # Initial learning rates (decayed linearly in the `done` branch below)
    lr_a, lr_c, lr_l = policy_params['lr_a'], policy_params[
        'lr_c'], policy_params['lr_l']
    lr_a_now = lr_a  # learning rate for actor
    lr_c_now = lr_c  # learning rate for critic
    lr_l_now = lr_l  # learning rate for lyapunov critic

    # Setup logger and log hyperparameters
    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low
    policy = policy_build_fn(a_dim, s_dim, policy_params)
    logger.logkv('target_entropy', policy.target_entropy)

    # For analyse: exponentially-weighted moving averages of episode
    # length and reward
    Render = env_params['eval_render']
    ewma_p = 0.95
    ewma_step = np.zeros((1, max_episodes + 1))
    ewma_reward = np.zeros((1, max_episodes + 1))

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False
    for i in range(max_episodes):

        ep_reward = 0
        l_r = 0

        # Diagnostics collected along the current episode path
        current_path = {
            'rewards': [],
            'l_rewards': [],
            'l_error': [],
            'critic1_error': [],
            'critic2_error': [],
            'alpha': [],
            'lambda': [],
            'entropy': [],
            'a_loss': [],
        }

        # Stop training if the max number of steps has been reached
        if global_step > max_global_steps:
            break

        s = env.reset()
        for j in range(max_ep_steps):
            if Render:
                env.render()

            # Retrieve action from the policy.
            # FIX: the original code passed the undefined name `noise`
            # (NameError at runtime); the decayed noise scale is the
            # intended exploration parameter.
            a = policy.choose_action(s, noise_scale_now)
            a = np.clip(a, -np.ones(a_dim), np.ones(a_dim))

            # Rescale the action from [-1, 1] to the environment action bounds
            action = a_lowerbound + (a + 1.) * (a_upperbound -
                                                a_lowerbound) / 2

            # Run in simulator
            s_, r, done, info = env.step(action)
            l_r = info['l_rewards']
            if j == max_ep_steps - 1:
                done = True
            terminal = 1. if done else 0.

            # Store (s, a, r, l_r, s_) transition for off-policy learning
            policy.store_transition(s, a, r, l_r, terminal, s_)

            # If the new state violates the safety constraint, also store the
            # transition in the dedicated edge (constraint) memory
            if policy.use_lyapunov is True and judge_safety_func(
                    s_, r, done, info):
                policy.store_edge_transition(s, a, r, l_r, terminal, s_)

            # Optimize the networks. The Lyapunov variant additionally
            # requires at least one stored edge (constraint) transition.
            if policy.use_lyapunov is True:
                if policy.pointer > min_memory_size and policy.cons_pointer > 0:
                    training_started = True
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)
                    global_step += 1
            else:
                if policy.pointer > min_memory_size:
                    training_started = True
                    labda, alpha, c1_loss, c2_loss, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_c_now, lr_l_now)
                    global_step += 1

            # Save path results (the diagnostic variables hold the values of
            # the most recent learn step)
            if training_started:
                current_path['rewards'].append(r)
                current_path['l_rewards'].append(l_r)
                current_path['l_error'].append(l_loss)
                current_path['critic1_error'].append(c1_loss)
                current_path['critic2_error'].append(c2_loss)
                current_path['alpha'].append(alpha)
                current_path['lambda'].append(labda)
                current_path['entropy'].append(entropy)
                current_path['a_loss'].append(a_loss)

            # Periodically evaluate the current performance and log results
            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:
                if evaluation_env is not None:
                    rollouts = get_evaluation_rollouts(policy,
                                                       evaluation_env,
                                                       num_of_paths,
                                                       max_ep_steps,
                                                       render=Render)
                    diagnotic = evaluate_rollouts(rollouts)
                    print(
                        'training_step:',
                        global_step,
                        'average reward:',
                        diagnotic['return-average'],
                        'average length:',
                        diagnotic['episode-length-avg'],
                    )

                    logger.logkv('eval_eprewmean', diagnotic['return-average'])
                    logger.logkv('eval_eprewmin', diagnotic['return-min'])
                    logger.logkv('eval_eprewmax', diagnotic['return-max'])
                    logger.logkv('eval_eplrewmean',
                                 diagnotic['lreturn-average'])
                    logger.logkv('eval_eplrewmin', diagnotic['lreturn-min'])
                    logger.logkv('eval_eplrewmax', diagnotic['lreturn-max'])
                    logger.logkv('eval_eplenmean',
                                 diagnotic['episode-length-avg'])
                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    logger.logkv('eprewmean',
                                 training_diagnotic['train-return-average'])
                    logger.logkv('eplrewmean',
                                 training_diagnotic['train-lreturn-average'])
                    logger.logkv(
                        'eplenmean',
                        training_diagnotic['train-episode-length-avg'])
                    logger.logkv('lyapunov_lambda',
                                 training_diagnotic['train-lambda-avg'])
                    logger.logkv('entropy',
                                 training_diagnotic['train-entropy-avg'])
                    logger.logkv('critic1 error',
                                 training_diagnotic['train-critic1-error-avg'])
                    logger.logkv('critic2 error',
                                 training_diagnotic['train-critic2-error-avg'])
                    logger.logkv(
                        'lyapunov error',
                        training_diagnotic['train-lyapunov-error-avg'])
                    logger.logkv('policy_loss',
                                 training_diagnotic['train-a-loss-avg'])
                    logger.logkv('noise_scale', noise_scale_now)
                    logger.logkv('lr_a', lr_a_now)
                    logger.logkv('lr_c', lr_c_now)
                    logger.logkv('lr_l', lr_l_now)
                    print(
                        'training_step:',
                        global_step,
                        'average reward:',
                        round(training_diagnotic['train-return-average'], 2),
                        'average lreward:',
                        round(training_diagnotic['train-lreturn-average'], 2),
                        'average length:',
                        round(training_diagnotic['train-episode-length-avg'],
                              1),
                        'lyapunov error:',
                        round(training_diagnotic['train-lyapunov-error-avg'],
                              6),
                        'critic1 error:',
                        round(training_diagnotic['train-critic1-error-avg'],
                              6),
                        'critic2 error:',
                        round(training_diagnotic['train-critic2-error-avg'],
                              6),
                        'policy_loss:',
                        round(training_diagnotic['train-a-loss-avg'], 6),
                        'alpha:',
                        round(training_diagnotic['train-alpha-avg'], 6),
                        'lambda:',
                        round(training_diagnotic['train-lambda-avg'], 6),
                        'entropy:',
                        round(training_diagnotic['train-entropy-avg'], 6),
                        'noise_scale',
                        round(noise_scale_now, 6),
                    )
                logger.dumpkvs()

            # State update
            s = s_
            ep_reward += r

            # Output training information and decay learning rates / noise
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                ewma_step[0,
                          i + 1] = ewma_p * ewma_step[0, i] + (1 - ewma_p) * j
                ewma_reward[
                    0, i +
                    1] = ewma_p * ewma_reward[0, i] + (1 - ewma_p) * ep_reward
                frac = 1.0 - (global_step - 1.0) / max_global_steps
                lr_a_now = lr_a * frac  # learning rate for actor
                lr_c_now = lr_c * frac  # learning rate for critic
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                noise_scale_now = noise_scale * frac
                break

    print('Running time: ', time.time() - t1)
    return
Example #5
0
def train(log_dir):
    """Performs the agent training.

    Args:
        log_dir (str): The directory in which the final model (policy) and the
        log data is saved.
    """

    # Create environment
    env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates (decayed linearly at the end of each
    # episode, see the `done` branch below)
    lr_a, lr_l = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for lyapunov critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the Lyapunov Actor Critic agent
    policy = LAC(a_dim, s_dim)

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    # Training loop
    for i in range(ENV_PARAMS["max_episodes"]):

        # Create variable to store information about the current path
        current_path = {
            "rewards": [],
            "a_loss": [],
            "alpha": [],
            "lambda": [],
            "lyapunov_error": [],
            "entropy": [],
        }

        # Stop training if max number of steps has been reached
        if global_step > ENV_PARAMS["max_global_steps"]:
            break

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for j in range(ENV_PARAMS["max_ep_steps"]):

            # Render environment if requested
            if ENV_PARAMS["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            a = policy.choose_action(s)
            # Rescale the action from [-1, 1] to the environment action bounds
            action = a_lowerbound + (a + 1.0) * (a_upperbound -
                                                 a_lowerbound) / 2

            # Perform action in env
            s_, r, done, _ = env.step(action)

            # Increment global step count (only once optimization has begun)
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if j == ENV_PARAMS["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize weights and parameters using STG
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                # NOTE(review): the third argument passed to `learn` is the
                # un-decayed `lr_a` — presumably the lagrange-multiplier
                # learning rate; confirm against the LAC.learn signature.
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    labda, alpha, l_loss, entropy, a_loss = policy.learn(
                        lr_a_now, lr_l_now, lr_a, batch)

            # Save path results (the diagnostic variables hold the values of
            # the most recent learn step)
            if training_started:
                current_path["rewards"].append(r)
                current_path["lyapunov_error"].append(l_loss)
                current_path["alpha"].append(alpha)
                current_path["lambda"].append(labda)
                current_path["entropy"].append(entropy)
                current_path["a_loss"].append(a_loss)

            # Evaluate the current performance and log results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    logger.logkv("lr_l", lr_l_now)
                    # Build a single human-readable progress line
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":",
                                 str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    print("".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Store the finished path and decay learning rates
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                frac = 1.0 - (global_step -
                              1.0) / ENV_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for lyapunov critic
                break

    # Save model and print Running time
    policy.save_result(log_dir)
    print("Running time: ", time.time() - t1)
    return
Example #6
0
def train(variant):
    """Trains a CAC agent on reference trajectories sampled from recorded data.

    Episodes start from random points inside randomly chosen data
    trajectories. The best model (lowest evaluation cost) is checkpointed to
    the log path.

    Args:
        variant (dict): Experiment configuration (environment name/parameters,
            algorithm parameters, log path, reward id, data and evaluation
            settings).
    """
    Min_cost = 1000000  # best (lowest) evaluation cost seen so far

    data_trajectories = get_data()  # get data (X, W, X_, theta, state)
    env_name = variant['env_name']  # choose your environment
    env = get_env_from_name(env_name)

    env_params = variant['env_params']

    max_episodes = env_params[
        'max_episodes']  # maximum episodes for RL training
    max_ep_steps = env_params[
        'max_ep_steps']  # number of maximum steps in each episode
    max_global_steps = env_params['max_global_steps']
    store_last_n_paths = variant['store_last_n_paths']
    evaluation_frequency = variant['evaluation_frequency']

    policy_params = variant['alg_params']

    min_memory_size = policy_params['min_memory_size']
    steps_per_cycle = policy_params['steps_per_cycle']
    train_per_cycle = policy_params['train_per_cycle']
    batch_size = policy_params['batch_size']

    s_dim = env.observation_space.shape[
        0]  # dimension of state (3 for Battery)
    a_dim = env.action_space.shape[0]  # action space dimension (1 or 2)
    a_upperbound = env.action_space.high
    a_lowerbound = env.action_space.low

    # Create the CAC agent
    agent = CAC(a_dim, s_dim, policy_params, max_global_steps=max_global_steps)

    # Create replay memory buffer ('value_horizon' is optional and defaults
    # to None, matching the original explicit fallback)
    pool_params = {
        's_dim': s_dim,
        'a_dim': a_dim,
        'd_dim': 1,
        'store_last_n_paths': store_last_n_paths,
        'memory_capacity': policy_params['memory_capacity'],
        'min_memory_size': policy_params['min_memory_size'],
        'history_horizon': policy_params['history_horizon'],
        'finite_horizon': policy_params['finite_horizon'],
        'value_horizon': policy_params.get('value_horizon'),
    }
    pool = Pool(pool_params)

    # For analyse
    Render = env_params['eval_render']

    # Training setting
    t1 = time.time()
    global_step = 0
    last_training_paths = deque(maxlen=store_last_n_paths)
    training_started = False

    # Setup logger and log hyperparameters
    log_path = variant['log_path']
    logger.configure(dir=log_path, format_strs=['csv'])
    logger.logkv('tau', policy_params['tau'])
    logger.logkv('alpha3', policy_params['alpha3'])
    logger.logkv('batch_size', policy_params['batch_size'])
    logger.logkv('target_entropy', agent.target_entropy)

    for i in range(max_episodes):
        print("episode # ", i)
        print("global steps ", global_step)

        # Diagnostics collected along the current episode path
        current_path = {
            'rewards': [],
            'distance': [],
            'a_loss': [],
            'alpha': [],
            'labda': [],
            'beta': [],
            'lyapunov_error': [],
            'entropy': [],
            'action_distance': [],
        }

        # Stop training if the max number of steps has been reached
        if global_step > max_global_steps:
            break

        s = env.reset()

        # Random start point inside a randomly chosen data trajectory
        traj_id = np.random.randint(0, variant['num_data_trajectories'])
        traj = data_trajectories[traj_id]
        start_point = np.random.randint(0, len(traj))
        s = traj[start_point, 1]

        # Agent state: current state, theta, desired state
        # (assumes the trajectory column layout produced by get_data —
        # TODO confirm)
        s = np.array([s, traj[start_point, 2], traj[start_point, 4]])

        env.state = s
        env.model.state = traj[start_point, -8:]

        # The episode ends at max_ep_steps or at the trajectory end,
        # whichever comes first
        ep_steps = min(start_point + 1 + max_ep_steps, len(traj))
        for j in range(start_point + 1, ep_steps):
            if Render:
                env.render()

            a = agent.act(torch.tensor([s]).float())

            # Rescale the action from [-1, 1] to the environment action bounds
            action = a_lowerbound + (a.detach().numpy() +
                                     1.) * (a_upperbound - a_lowerbound) / 2

            # Refresh the action bounds for the next step
            a_upperbound = env.action_space.high
            a_lowerbound = env.action_space.low

            # Run in simulator
            _, r, done, X_ = env.step(action)

            # The new state: current state, next omega, next desired state
            s_ = np.array([X_[1][0], traj[j, 2], traj[j, 4]])

            r = modify_reward(r, s, s_, variant['reward_id'])

            env.state = s_

            # Increment global step count (only once optimization has begun)
            if training_started:
                global_step += 1

            if j == max_ep_steps - 1 + start_point:
                done = True

            terminal = 1. if done else 0.

            # Store the transition (needs the previous state `_s`, which is
            # only available after the first two steps of the episode)
            if j > start_point + 2:
                pool.store(s,
                           a.detach().numpy().flatten(), np.zeros([1]),
                           np.zeros([1]), r, terminal, s_, _s)

            # Optimize the agent once enough memory has been collected
            if pool.memory_pointer > min_memory_size and global_step % steps_per_cycle == 0:
                training_started = True

                for _ in range(train_per_cycle):
                    batch = pool.sample(batch_size)
                    alpha_loss, beta_loss, labda_loss, actor_loss, lyapunov_loss = agent.learn(
                        batch)
                    if global_step % 200 == 0:
                        print("labda = ", agent.labda.item(), " | alpha = ",
                              agent.alpha.item(), " | l_loss = ",
                              lyapunov_loss.item(), " | constraint loss : ",
                              agent.lyapunov_loss.item(), " | entropy = ",
                              agent.log_pis.mean().item(), " | a_loss = ",
                              actor_loss.item(), " | alpha_loss = ",
                              alpha_loss.item(), " | labda_loss = ",
                              labda_loss.item(), " | lr_a = ", agent.LR_A,
                              " | lr_l = ", agent.LR_L, " | lr_labda = ",
                              agent.LR_lag, " | log alpha grad = ",
                              agent.log_alpha.grad.item(),
                              " | log labda grad = ",
                              agent.log_labda.grad.item(), " | predicted_l : ",
                              agent.l.mean().item(), " | predicted_l_ : ",
                              agent.l_.mean().item())

            # Save path results (values from the most recent learn step)
            if training_started:
                current_path['rewards'].append(r)
                current_path['lyapunov_error'].append(
                    lyapunov_loss.detach().numpy())
                current_path['alpha'].append(agent.alpha.detach().numpy())
                current_path['entropy'].append(
                    agent.log_pis.mean().detach().cpu().numpy())
                current_path['a_loss'].append(actor_loss.detach().numpy())
                current_path['beta'].append(agent.beta.detach().numpy())

            # Periodically evaluate the current performance and log results
            if training_started and global_step % evaluation_frequency == 0 and global_step > 0:

                logger.logkv("total_timesteps", global_step)

                training_diagnotic = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnotic is not None:
                    print("doing training evaluation")
                    eval_diagnotic = training_evaluation(variant, env, agent)
                    [
                        logger.logkv(key, eval_diagnotic[key])
                        for key in eval_diagnotic.keys()
                    ]
                    training_diagnotic.pop('return')
                    [
                        logger.logkv(key, training_diagnotic[key])
                        for key in training_diagnotic.keys()
                    ]
                    logger.logkv('lr_actor_alpha', agent.LR_A)
                    logger.logkv('lr_lyapunov', agent.LR_L)
                    logger.logkv('lr_labda', agent.LR_lag)
                    # Build a single human-readable progress line
                    string_to_print = ['time_step:', str(global_step), '|']
                    [
                        string_to_print.extend(
                            [key, ':', str(eval_diagnotic[key]), '|'])
                        for key in eval_diagnotic.keys()
                    ]
                    [
                        string_to_print.extend([
                            key, ':',
                            str(round(training_diagnotic[key], 2)), '|'
                        ]) for key in training_diagnotic.keys()
                    ]
                    print(''.join(string_to_print))

                logger.dumpkvs()

                # FIX: `eval_diagnotic` is only bound when
                # `training_diagnotic` is not None; the original code read it
                # unconditionally, raising a NameError on the first evaluation
                # before any episode had finished.
                if training_diagnotic is not None:
                    avg_cost = (eval_diagnotic['test_return'] /
                                eval_diagnotic['test_average_length'])
                    if avg_cost <= Min_cost:
                        Min_cost = avg_cost
                        print("New lowest cost:", Min_cost)
                        agent.save_result(log_path)
                    else:
                        print("cost did not improve.")
                        print("The best cost is ", Min_cost)
                        print("avg cost was ", avg_cost)

                # Periodic checkpoint regardless of evaluation outcome
                if training_started and global_step % (
                        10 * evaluation_frequency) == 0 and global_step > 0:
                    agent.save_result(log_path)

            # State update (keep the previous state for pool.store)
            _s = s
            s = s_

            # Store the finished path for training diagnostics
            if done:
                if training_started:
                    last_training_paths.appendleft(current_path)
                break

    # Save final model and print running time
    agent.save_result(log_path)

    print('Running time: ', time.time() - t1)
    return
Example #7
0
def train(log_dir):
    """Performs the agent training.

    Trains the agent (LAC when Lyapunov mode is enabled, SAC otherwise) in the
    ``ENV_NAME`` environment until ``TRAIN_PARAMS["max_global_steps"]`` steps
    have been performed, periodically logging diagnostics and saving the policy.

    Args:
        log_dir (str): The directory in which the final model (policy) and the log data
            is saved.
    """

    # Create train and test environments
    print(
        colorize(
            f"INFO: You are training in the {ENV_NAME} environment.",
            "cyan",
            bold=True,
        ))
    env = get_env_from_name(ENV_NAME, ENV_SEED)
    test_env = get_env_from_name(ENV_NAME, ENV_SEED)

    # Set initial learning rates
    lr_a, lr_l, lr_c = (
        ALG_PARAMS["lr_a"],
        ALG_PARAMS["lr_l"],
        ALG_PARAMS["lr_c"],
    )
    lr_a_now = ALG_PARAMS["lr_a"]  # learning rate for actor, lambda and alpha
    lr_l_now = ALG_PARAMS["lr_l"]  # learning rate for Lyapunov critic
    lr_c_now = ALG_PARAMS["lr_c"]  # learning rate for q critic

    # Get observation and action space dimension and limits from the environment
    s_dim = env.observation_space.shape[0]
    a_dim = env.action_space.shape[0]
    a_lowerbound = env.action_space.low
    a_upperbound = env.action_space.high

    # Create the Agent
    policy = LAC(a_dim,
                 s_dim,
                 act_limits={
                     "low": a_lowerbound,
                     "high": a_upperbound
                 })

    # Load model if retraining is selected
    if TRAIN_PARAMS["continue_training"]:

        # Create retrain model path
        retrain_model_folder = TRAIN_PARAMS["continue_model_folder"]
        retrain_model_path = osp.abspath(
            osp.join(log_dir, "../..", TRAIN_PARAMS["continue_model_folder"]))

        # Check if retrain model exists if not throw error
        if not osp.exists(retrain_model_path):
            print(
                colorize(
                    ("ERROR: Shutting down training since the model you specified "
                     f"in the `continue_model_folder` `{retrain_model_folder}` "
                     f"argument was not found for the `{ENV_NAME}` environment."
                     ),
                    "red",
                    bold=True,
                ))
            # NOTE(review): exits with status 0 even though this is an error
            # condition — confirm whether callers rely on the zero exit code.
            sys.exit(0)

        # Load old model
        print(
            colorize(f"INFO: Restoring model `{retrain_model_path}`.",
                     "cyan",
                     bold=True))
        result = policy.restore(
            osp.abspath(osp.join(retrain_model_path, "policy")),
            restore_lagrance_multipliers=(
                not ALG_PARAMS["reset_lagrance_multipliers"]),
        )
        if not result:
            print(
                colorize(
                    # FIX: typo in the error message ("Shuting" -> "Shutting").
                    "ERROR: Shutting down training as something went wrong while "
                    "loading "
                    f"model `{retrain_model_folder}`.",
                    "red",
                    bold=True,
                ))
            sys.exit(0)

        # Create new storage folder
        log_dir_split = log_dir.split("/")
        log_dir_split[-2] = (
            "_".join(TRAIN_PARAMS["continue_model_folder"].split("/")) +
            "_finetune")
        log_dir = "/".join(log_dir_split)
    else:
        print(colorize(f"INFO: Train new model `{log_dir}`", "cyan",
                       bold=True))

    # Print logging folder path
    print(colorize(f"INFO: Logging results to `{log_dir}`.", "cyan",
                   bold=True))

    # Create replay memory buffer
    pool = Pool(
        s_dim=s_dim,
        a_dim=a_dim,
        store_last_n_paths=TRAIN_PARAMS["num_of_training_paths"],
        memory_capacity=ALG_PARAMS["memory_capacity"],
        min_memory_size=ALG_PARAMS["min_memory_size"],
    )

    # Setup logger and log hyperparameters
    logger.configure(dir=log_dir, format_strs=["csv"])
    logger.logkv("tau", ALG_PARAMS["tau"])
    logger.logkv("alpha3", ALG_PARAMS["alpha3"])
    logger.logkv("batch_size", ALG_PARAMS["batch_size"])
    logger.logkv("target_entropy", policy.target_entropy)

    ####################################################
    # Training loop ####################################
    ####################################################

    # Setup training loop parameters
    t1 = time.time()
    global_step = 0
    global_episodes = 0
    last_training_paths = deque(maxlen=TRAIN_PARAMS["num_of_training_paths"])
    training_started = False

    # Train the agent in the environment until max_episodes has been reached
    print(colorize("INFO: Training...\n", "cyan", bold=True))
    while 1:  # Keep running episodes until global step has been reached

        # Create variable to store information about the current path
        if policy.use_lyapunov:
            current_path = {
                "rewards": [],
                "lyapunov_error": [],
                "alpha": [],
                "lambda": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
                "lambda_loss": [],
            }
        else:
            current_path = {
                "rewards": [],
                "critic_error": [],
                "alpha": [],
                "entropy": [],
                "a_loss": [],
                "alpha_loss": [],
            }

        # Reset environment
        s = env.reset()

        # Training Episode loop
        for jj in range(ENVS_PARAMS[ENV_NAME]["max_ep_steps"]):

            # Break out of loop if global steps have been reached
            if global_step >= TRAIN_PARAMS["max_global_steps"]:

                # Print step count, save model and stop the program
                print(
                    colorize(
                        f"\nINFO: Training stopped after {global_step} steps.",
                        "cyan",
                        bold=True,
                    ))
                print(
                    colorize(
                        "INFO: Running time: {}".format(time.time() - t1),
                        "cyan",
                        bold=True,
                    ))
                print(colorize("INFO: Saving Model", "cyan", bold=True))
                policy.save_result(log_dir)
                return

            # Save intermediate checkpoints if requested
            if TRAIN_PARAMS["save_checkpoints"]:
                if (global_step % TRAIN_PARAMS["checkpoint_save_freq"] == 0
                        and global_step != 0):

                    # Create intermediate result checkpoint folder
                    # FIX: name the folder after `global_step` (not the
                    # within-episode step `jj`) so checkpoints from different
                    # episodes do not overwrite each other; the save frequency
                    # above is also keyed on `global_step`.
                    checkpoint_save_path = osp.abspath(
                        osp.join(log_dir, "checkpoints",
                                 "step_" + str(global_step)))
                    os.makedirs(checkpoint_save_path, exist_ok=True)

                    # Save intermediate checkpoint
                    policy.save_result(checkpoint_save_path)

            # Render environment if requested
            if ENVS_PARAMS[ENV_NAME]["eval_render"]:
                env.render()

            # Retrieve (scaled) action based on the current policy
            # NOTE (rickstaa): The scaling operation is already performed inside the
            # policy based on the `act_limits` you supplied.
            a = policy.choose_action(s)

            # Perform action in env
            s_, r, done, _ = env.step(a)

            # Increment global step count
            if training_started:
                global_step += 1

            # Stop episode if max_steps has been reached
            if jj == ENVS_PARAMS[ENV_NAME]["max_ep_steps"] - 1:
                done = True
            terminal = 1.0 if done else 0.0

            # Store experience in replay buffer
            pool.store(s, a, r, terminal, s_)

            # Optimize network weights and lagrance multipliers
            if (pool.memory_pointer > ALG_PARAMS["min_memory_size"]
                    and global_step % ALG_PARAMS["steps_per_cycle"] == 0):
                training_started = True

                # Perform STG a set number of times (train per cycle)
                for _ in range(ALG_PARAMS["train_per_cycle"]):
                    batch = pool.sample(ALG_PARAMS["batch_size"])
                    if policy.use_lyapunov:
                        (
                            labda,
                            alpha,
                            l_loss,
                            entropy,
                            a_loss,
                            alpha_loss,
                            labda_loss,
                        ) = policy.learn(lr_a_now, lr_l_now, lr_a, lr_c_now,
                                         batch)
                    else:
                        alpha, loss_q, entropy, a_loss, alpha_loss = policy.learn(
                            lr_a_now, lr_l_now, lr_a, lr_c_now, batch)

            # Store current path results
            # NOTE(review): on steps where no `policy.learn` call happened the
            # loss/alpha/lambda values appended below are the (stale) results
            # of the most recent update — confirm this is intended.
            if training_started:
                if policy.use_lyapunov:
                    current_path["rewards"].append(r)
                    current_path["lyapunov_error"].append(l_loss)
                    current_path["alpha"].append(alpha)
                    current_path["lambda"].append(labda)
                    current_path["entropy"].append(entropy)
                    current_path["a_loss"].append(a_loss)
                    current_path["alpha_loss"].append(alpha_loss)
                    current_path["lambda_loss"].append(labda_loss)
                else:
                    current_path["rewards"].append(r)
                    current_path["critic_error"].append(loss_q.numpy())
                    current_path["alpha"].append(alpha.numpy())
                    current_path["entropy"].append(entropy.numpy())
                    current_path["a_loss"].append(a_loss.numpy(
                    ))  # Improve: Check if this is the fastest way
                    current_path["alpha_loss"].append(alpha_loss)

            # Evaluate the current policy performance and log the results
            if (training_started
                    and global_step % TRAIN_PARAMS["evaluation_frequency"] == 0
                    and global_step > 0):
                logger.logkv("total_timesteps", global_step)
                training_diagnostics = evaluate_training_rollouts(
                    last_training_paths)
                if training_diagnostics is not None:
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        eval_diagnostics = training_evaluation(
                            test_env, policy)
                        [
                            logger.logkv(key, eval_diagnostics[key])
                            for key in eval_diagnostics.keys()
                        ]
                        training_diagnostics.pop("return")
                    [
                        logger.logkv(key, training_diagnostics[key])
                        for key in training_diagnostics.keys()
                    ]
                    logger.logkv("lr_a", lr_a_now)
                    if policy.use_lyapunov:
                        logger.logkv("lr_l", lr_l_now)
                    else:
                        logger.logkv("lr_c", lr_c_now)
                    string_to_print = ["time_step:", str(global_step), "|"]
                    if TRAIN_PARAMS["num_of_evaluation_paths"] > 0:
                        [
                            string_to_print.extend(
                                [key, ":",
                                 str(eval_diagnostics[key]), "|"])
                            for key in eval_diagnostics.keys()
                        ]
                    [
                        string_to_print.extend([
                            key, ":",
                            str(round(training_diagnostics[key], 2)), "|"
                        ]) for key in training_diagnostics.keys()
                    ]
                    # FIX: colorize the algorithm prefix only once. The old
                    # code wrapped the already-colorized prefix in a second
                    # `colorize` call, nesting ANSI escape sequences and
                    # defeating the intended green-LAC/yellow-SAC coloring.
                    prefix = (colorize("LAC|", "green", bold=True)
                              if ALG_PARAMS["use_lyapunov"] else colorize(
                                  "SAC|", "yellow", bold=True))
                    print(prefix + "".join(string_to_print))
                logger.dumpkvs()

            # Update state
            s = s_

            # Check if episode is done (continue to next episode)
            if done:

                # Store paths
                if training_started:
                    last_training_paths.appendleft(current_path)

                # Decay learning rates
                frac = 1.0 - (global_step -
                              1.0) / TRAIN_PARAMS["max_global_steps"]
                lr_a_now = lr_a * frac  # learning rate for actor, lambda, alpha
                lr_l_now = lr_l * frac  # learning rate for Lyapunov critic
                lr_c_now = lr_c * frac  # learning rate for q critic
                break  # Continue to next episode

    # Increase episode counter
    # NOTE(review): this statement is unreachable — the `while 1` loop above is
    # only left via the `return` inside it, so `global_episodes` never changes.
    global_episodes += 1