Exemplo n.º 1
0
def learn(Q,
          operator,
          data,
          demand,
          min_env_flow,
          actions_report_file="",
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):
    """Gradient-based Q-learning on randomly sampled years of Lake Como data.

    Each episode simulates one hydrological year: a year is drawn uniformly
    from ``data``, its inflow series is fed to a fresh ``LakeEnv``, and on
    leap years the demand profile gets day index 60 duplicated from day 59.
    Exploration follows a scheduled Gibbs policy; every ``eval_freq``
    *completed episodes* the greedy (tau=inf) policy is evaluated on
    ``eval_episodes`` freshly sampled years via ``_single_year_eval``.

    :param Q: Q-function approximator exposing a flat weight vector ``_w``
    :param operator: Bellman operator exposing ``gradient_be`` and
        ``bellman_residual``
    :param data: DataFrame with at least 'year' and 'in' (daily inflow) columns
    :param demand: daily demand profile for a 365-day (non-leap) year
    :param min_env_flow: minimum environmental flow passed to ``Lakecomo``
    :param actions_report_file: optional CSV path; when non-empty, per-episode
        action frequencies are written to it after every finished episode
    :param eval_states: unused here; kept for interface compatibility
    :param render: unused here; kept for interface compatibility
    :return: ``[[], weights, run_info]`` — final weight vector and the
        collected learning curves
    """

    # Leap-year demand: duplicate day index 59 so Feb 29 reuses Feb 28's demand
    leap_year_demand = np.insert(demand, 60, demand[59])

    if seed is not None:
        np.random.seed(seed)

    # mdp creation
    lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed)
    years = data.year.unique()
    description = str(int(years[0])) + "-" + str(int(years[-1]))
    sampled_year = np.random.choice(years)
    inflow = list(data.loc[data['year'] == sampled_year, 'in'])
    if sampled_year % 4 == 0:  # leap years between 1946 and 2011 satisfy this condition even though it's not the complete leap year condition
        mdp = LakeEnv(inflow, leap_year_demand, lake)
    else:
        mdp = LakeEnv(inflow, demand, lake)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies: scheduled-temperature Gibbs for learning,
    # tau=0 (uniform/random) for warm-up episodes, tau=inf (greedy) for eval
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledGibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), schedule)
    pi_u = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=0)
    pi_g = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=np.inf)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.observation_space.shape[0], mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.observation_space.shape[0])).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    if actions_report_file:
        actions_executed = []

        columns = list(range(mdp.N_DISCRETE_ACTIONS))
        actions_report_df = pd.DataFrame(columns=columns)
        actions_report_df.to_csv(actions_report_file, index=False)

    done_counter = 0

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        if actions_report_file:
            actions_executed.append(a)

        # Step
        s_prime, r, done, _ = mdp.step(a)

        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode (discounted by in-episode step h)
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime

        h += 1
        if done or h >= mdp.horizon:

            if actions_report_file:
                # minlength guarantees one count per action: a plain bincount
                # would return a shorter vector whenever the highest-index
                # actions were never executed, silently dropping CSV columns
                actions_counts = np.bincount(actions_executed,
                                             minlength=mdp.N_DISCRETE_ACTIONS)
                actions_freqs = list(actions_counts / sum(actions_counts))
                new_row = dict(zip(columns, actions_freqs))
                # DataFrame.append was removed in pandas 2.0; concatenate a
                # one-row frame instead
                actions_report_df = pd.concat(
                    [actions_report_df, pd.DataFrame([new_row])],
                    ignore_index=True)
                actions_report_df.to_csv(actions_report_file, index=False)

                actions_executed = []

            episode_rewards.append(0.0)

            # Start a new episode on a freshly sampled year
            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])
            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)

            s = mdp.reset()

            h = 0
            episode_t.append(i)

            done_counter += 1

        # Evaluate model every eval_freq *completed episodes*
        if done_counter == eval_freq:

            # Evaluate greedy policy
            scores = []
            for _ in range(eval_episodes):
                sampled_year = np.random.choice(years)
                inflow = list(data.loc[data['year'] == sampled_year, 'in'])
                if sampled_year % 4 == 0:
                    mdp = LakeEnv(inflow, leap_year_demand, lake)
                else:
                    mdp = LakeEnv(inflow, demand, lake)

                scores.append(_single_year_eval(mdp, pi_g))

            rew = np.mean(scores)

            learning_rew = np.mean(
                episode_rewards[-mean_episodes -
                                1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Evaluation consumed the mdp — resample a year and restart
            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])

            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)

            s = mdp.reset()

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))

            done_counter = 0

        # Coarse progress report roughly every 10% of iterations
        if (i * 100 / max_iter) % 10 == 0:
            print("years:", description, "- Progress:",
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    last_rewards = 5
    print("years:", description, "- Last evaluation rewards:",
          np.around(evaluation_rewards[-last_rewards:], decimals=3))

    return [[], weights, run_info]
Exemplo n.º 2
0
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):
    """Gradient-based Q-learning with scheduled epsilon-greedy exploration.

    Runs ``max_iter`` environment steps on ``mdp``, storing transitions in a
    replay buffer and taking an ADAM step on the Bellman-error gradient every
    ``train_freq`` iterations.  Every ``eval_freq`` iterations the greedy
    policy is evaluated via ``utils.evaluate_policy``.

    :param mdp: environment exposing ``reset``/``step``, ``gamma``,
        ``horizon``, ``state_dim``, ``action_dim`` and ``action_space.n``
    :param Q: Q-function approximator with a flat weight vector ``_w``
    :param operator: Bellman operator exposing ``gradient_be`` and
        ``bellman_residual``
    :param exploration_fraction: fraction of ``max_iter`` over which epsilon
        is annealed linearly from ``eps_start`` to ``eps_end``
    :param random_episodes: number of fully-random warm-up episodes used to
        pre-fill the replay buffer
    :return: ``[mdp.get_info(), weights, run_info]``
    """
    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        # Q.init_weights()
        # NOTE(review): the re-initialization above is commented out, so the
        # incoming weights are kept (warm start?) — confirm this is intended
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies.  np.linspace requires an integer sample count:
    # passing the float exploration_fraction * max_iter raises a TypeError
    # on modern NumPy, so cast explicitly (as the other variants do).
    schedule = np.linspace(eps_start, eps_end,
                           int(exploration_fraction * max_iter))
    pi = ScheduledEpsilonGreedy(Q, np.arange(mdp.action_space.n), schedule)
    pi_u = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=1)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(
        mdp, pi_u, n_episodes=random_episodes,
        preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode (discounted by in-episode step h)
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Evaluate greedy policy
            rew = utils.evaluate_policy(mdp,
                                        pi_g,
                                        render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = np.mean(
                episode_rewards[-mean_episodes -
                                1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Make sure we restart from s (evaluation moved the env state)
            mdp.reset(s)

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(Q._w)

    return [mdp.get_info(), weights, run_info]
Exemplo n.º 3
0
def learn(
        mdp,
        Q,
        operator,
        max_iter=5000,
        buffer_size=10000,
        batch_size=50,
        alpha_adam=0.001,
        alpha_sgd=0.1,
        lambda_=0.001,
        n_weights=10,
        train_freq=1,
        eval_freq=50,
        random_episodes=0,
        eval_states=None,
        eval_episodes=1,
        mean_episodes=50,
        preprocess=lambda x: x,
        cholesky_clip=0.0001,
        bandwidth=0.00001,
        post_components=1,
        max_iter_ukl=60,
        eps=0.001,
        eta=1e-6,
        time_coherent=False,
        source_file=None,
        seed=None,
        render=False,
        verbose=True,
        ukl_tight_freq=1,
        sources=None,
        # Lambda function to calculate the weights
        weights_calculator=None):
    """Variational transfer with a Gaussian-mixture posterior over Q weights.

    A mixture-of-Gaussians prior is built from source-task weight vectors
    (one randomly chosen sample per timestep of ``source_file``/``sources``,
    each component an isotropic Gaussian of variance ``bandwidth``).  A
    ``post_components``-component mixture posterior is initialized to
    approximate that prior (``tight_ukl`` / ``init_posterior``) and its
    packed parameters are then optimized online: ADAM updates the component
    means, SGD updates the Cholesky factors, and ``clip`` keeps the factors
    numerically valid.  Exploration samples Q weights from the posterior
    (resampled every step unless ``time_coherent``, and at every episode
    boundary).

    :param weights_calculator: optional callable mapping the stacked source
        weights to mixture weights; uniform weights are used when None
    :return: ``[mdp.get_info(), weights, run_info]`` where ``weights`` are
        the posterior component means
    """

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables (caches shared with the gradient/objective
    # helpers defined elsewhere in this module)
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies (greedy; exploration comes from weight sampling)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features (K) and posterior components (C)
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        # NOTE(review): shuffles the caller's `sources` entries in place —
        # confirm callers do not rely on their original order
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The gaussian mixture weights are uniform if not provided.
    c_bar = np.ones(
        timesteps
    ) / timesteps if weights_calculator is None else weights_calculator(ws)

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    # Prior: one isotropic Gaussian (variance = bandwidth) per kept source
    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis],
                            (timesteps, 1, 1))

    # We initialize the parameters of the posterior to the best approximation of the posterior family to the prior
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)

    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c,
                         mu,
                         Sigma,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         phi,
                         psi,
                         max_iter=max_iter_ukl,
                         eps=eps)
    params, phi, psi = init_posterior(c,
                                      mu,
                                      Sigma,
                                      c_bar,
                                      mu_bar,
                                      Sigma_bar,
                                      phi,
                                      psi,
                                      C,
                                      K,
                                      cholesky_clip,
                                      max_iter_ukl,
                                      max_iter=max_iter_ukl * 10,
                                      precision=Sigma_bar_inv,
                                      eta=eta,
                                      eps=eps,
                                      verbose=verbose)

    # Add random episodes if needed (weights drawn from the prior mixture)
    init_samples = list()
    if random_episodes > 0:
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C),
                     np.ones((C, K)) * alpha_adam, np.zeros(
                         (C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)),
                    np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient of the variational objective
            g = gradient(buffer.sample_batch(batch_size),
                         params,
                         Q,
                         c_bar,
                         mu_bar,
                         Sigma_bar,
                         operator,
                         i + 1,
                         phi,
                         psi,
                         n_weights,
                         lambda_,
                         max_iter_ukl,
                         C,
                         K,
                         precision=Sigma_bar_inv,
                         t_step=i,
                         ukl_tight_freq=ukl_tight_freq)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode (discounted by in-episode step h)
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            # Fresh posterior sample for the new episode
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights (evaluation overwrites Q._w)
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function: average greedy performance over the
            # C posterior component means
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp,
                                             pi_g,
                                             render=render,
                                             initial_states=eval_states,
                                             n_episodes=eval_episodes,
                                             preprocess=preprocess)[0]
            rew /= C

            learning_rew = np.mean(
                episode_rewards[-mean_episodes -
                                1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size),
                             params,
                             Q,
                             c_bar,
                             mu_bar,
                             Sigma_bar,
                             operator,
                             i + 1,
                             phi,
                             psi,
                             n_weights,
                             lambda_,
                             C,
                             K,
                             precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

        # Coarse progress report roughly every 10% of iterations
        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " +
                  str(int(i * 100 / max_iter)) + "%")

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]
Exemplo n.º 4
0
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):
    """Variational transfer with a single-Gaussian posterior over Q weights.

    The prior is a Gaussian fitted to ``n_source`` randomly chosen
    source-task weight vectors (empirical mean and covariance, regularized
    by ``sigma_reg`` for the precision and ``cholesky_clip**2`` for the
    initial Cholesky factor).  The posterior is initialized at the prior and
    its packed (mean, Cholesky) parameters are optimized online: ADAM
    updates the mean, SGD updates the Cholesky factor, and ``clip`` keeps
    the factor valid.  Exploration samples Q weights from the posterior
    (resampled every step unless ``time_coherent``, and at every episode
    boundary).

    :param n_source: number of source weight vectors used to fit the prior
    :param sigma_reg: diagonal regularizer added before inverting the
        prior covariance
    :return: ``[mdp.get_info(), weights, run_info]`` where ``weights`` is
        the final posterior mean
    """

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset cache shared with the gradient/objective helpers defined
    # elsewhere in this module
    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies (greedy; exploration comes from weight sampling)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior with smaller regularization (just to make sure Sigma_bar is pd)
    params = clip(
        pack(mu_bar,
             np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip**2)),
        cholesky_clip, K)

    # Add random episodes if needed (weights drawn from the posterior)
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            Q._w = sample_posterior(params, K)
            init_samples.append(
                utils.generate_episodes(mdp,
                                        pi_g,
                                        n_episodes=1,
                                        preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(
            init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate(
            (t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
             preprocess(s_prime), absorbing[:, np.newaxis]),
            axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam, np.zeros(
        (K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K),
                    np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for Variance
    v_t_var = 0.

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient of the variational objective
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar,
                         Sigma_bar_inv, operator, i + 1, lambda_, n_weights)

            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params,
                                             g,
                                             t,
                                             m_t,
                                             v_t,
                                             alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params,v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode (discounted by in-episode step h)
        episode_rewards[-1] += r * mdp.gamma**h

        s = s_prime
        h += 1
        if done or h >= mdp.horizon:

            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            # Fresh posterior sample for the new episode
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights (evaluation overwrites Q._w)
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function (greedy policy at the posterior mean)
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp,
                                        pi_g,
                                        render=render,
                                        initial_states=eval_states,
                                        n_episodes=eval_episodes,
                                        preprocess=preprocess)[0]
            learning_rew = np.mean(
                episode_rewards[-mean_episodes -
                                1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q,
                                           buffer.sample_batch(batch_size))**2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q,
                             mu_bar, Sigma_bar_inv, operator, i + 1, lambda_,
                             n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print(
                    "Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s"
                    .format(i, episodes[-1], rew, learning_rew, fval, l_2_err,
                            l_inf_err, elapsed_time))

    run_info = [
        iterations, episodes, n_samples, learning_rewards, evaluation_rewards,
        l_2, l_inf, fvals, episode_rewards[:len(episode_t)], episode_t
    ]
    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]