def learn(Q,
          operator,
          data,
          demand,
          min_env_flow,
          actions_report_file="",
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):

    leap_year_demand = np.insert(demand, 60, demand[59])

    if seed is not None:
        np.random.seed(seed)

    # mdp creation
    lake = Lakecomo(None, None, min_env_flow, None, None, seed=seed)
    years = data.year.unique()
    description = str(int(years[0])) + "-" + str(int(years[-1]))
    sampled_year = np.random.choice(years)
    inflow = list(data.loc[data['year'] == sampled_year, 'in'])
    if sampled_year % 4 == 0:  # leap years between 1946 and 2011 satisfy this check, even though it is not the full leap-year rule
        mdp = LakeEnv(inflow, leap_year_demand, lake)
    else:
        mdp = LakeEnv(inflow, demand, lake)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    schedule = np.linspace(eps_start, eps_end, int(exploration_fraction * max_iter))
    pi = ScheduledGibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), schedule)
    pi_u = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=0)
    pi_g = Gibbs(Q, np.arange(mdp.N_DISCRETE_ACTIONS), tau=np.inf)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(mdp, pi_u, n_episodes=random_episodes,
                                           preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(init_samples, mdp.observation_space.shape[0],
                                                              mdp.action_dim)
        init_samples = np.concatenate((t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
                                       preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.observation_space.shape[0])).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    if actions_report_file:
        actions_executed = []
        columns = list(range(mdp.N_DISCRETE_ACTIONS))
        actions_report_df = pd.DataFrame(columns=columns)
        actions_report_df.to_csv(actions_report_file, index=False)

    done_counter = 0

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)

        if actions_report_file:
            actions_executed.append(a)

        # Step
        s_prime, r, done, _ = mdp.step(a)

        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1

        if done or h >= mdp.horizon:

            if actions_report_file:
                actions_counts = np.bincount(actions_executed)
                actions_freqs = list(actions_counts / sum(actions_counts))
                new_row = dict(zip(columns, actions_freqs))
                # DataFrame.append was removed in pandas 2.0; concat is the supported equivalent
                actions_report_df = pd.concat([actions_report_df, pd.DataFrame([new_row])],
                                              ignore_index=True)
                actions_report_df.to_csv(actions_report_file, index=False)
                actions_executed = []

            episode_rewards.append(0.0)

            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])
            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)

            s = mdp.reset()
            h = 0
            episode_t.append(i)

            done_counter += 1

        # Evaluate model
        if done_counter == eval_freq:

            # Evaluate greedy policy
            scores = []
            for _ in range(eval_episodes):
                sampled_year = np.random.choice(years)
                inflow = list(data.loc[data['year'] == sampled_year, 'in'])
                if sampled_year % 4 == 0:
                    mdp = LakeEnv(inflow, leap_year_demand, lake)
                else:
                    mdp = LakeEnv(inflow, demand, lake)
                scores.append(_single_year_eval(mdp, pi_g))
            rew = np.mean(scores)

            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            sampled_year = np.random.choice(years)
            inflow = list(data.loc[data['year'] == sampled_year, 'in'])
            if sampled_year % 4 == 0:
                mdp = LakeEnv(inflow, leap_year_demand, lake)
            else:
                mdp = LakeEnv(inflow, demand, lake)
            s = mdp.reset()

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s".format(
                    i, episodes[-1], rew, learning_rew, l_2_err, l_inf_err, elapsed_time))

            done_counter = 0

        if (i * 100 / max_iter) % 10 == 0:
            print("years:", description, "- Progress:", str(int(i * 100 / max_iter)) + "%")

    run_info = [iterations, episodes, n_samples, learning_rewards, evaluation_rewards, l_2, l_inf,
                episode_rewards[:len(episode_t)], episode_t]
    weights = np.array(Q._w)

    last_rewards = 5
    print("years:", description, "- Last evaluation rewards:",
          np.around(evaluation_rewards[-last_rewards:], decimals=3))

    return [[], weights, run_info]

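# ---------------------------------------------------------------------------
# Hedged, self-contained sketch (not part of the original code): how the
# per-episode action-report rows above are built. np.bincount counts how often
# each discrete action was taken during the episode, and dividing by the
# episode length turns counts into frequencies. Passing minlength=n_actions
# (an assumption, not used above) guarantees one entry per action even when
# some actions were never selected.
def _action_frequencies_sketch(actions_executed, n_actions):
    """Return per-episode action frequencies as a list of length n_actions."""
    counts = np.bincount(actions_executed, minlength=n_actions)
    return list(counts / counts.sum())
# Example: _action_frequencies_sketch([0, 0, 2], 4) -> [0.667, 0.0, 0.333, 0.0]
# ---------------------------------------------------------------------------
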
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha=0.001,
          train_freq=1,
          eval_freq=50,
          eps_start=1.0,
          eps_end=0.02,
          exploration_fraction=0.2,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          seed=None,
          render=False,
          verbose=True):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        # Q.init_weights()
        if isinstance(operator, DQNOperator):
            operator._q_target._w = Q._w

    # Initialize policies
    # np.linspace requires an integer sample count
    schedule = np.linspace(eps_start, eps_end, int(exploration_fraction * max_iter))
    pi = ScheduledEpsilonGreedy(Q, np.arange(mdp.action_space.n), schedule)
    pi_u = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=1)
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Add random episodes if needed
    init_samples = utils.generate_episodes(mdp, pi_u, n_episodes=random_episodes,
                                           preprocess=preprocess) if random_episodes > 0 else None
    if random_episodes > 0:
        t, s, a, r, s_prime, absorbing, sa = utils.split_data(init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate((t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
                                       preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # Take epsilon-greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = pi.sample_action(s_prep)

        # Step
        s_prime, r, done, _ = mdp.step(a)

        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = operator.gradient_be(Q, buffer.sample_batch(batch_size))
            # Take a gradient step
            Q._w, t, m_t, v_t = utils.adam(Q._w, g, t, m_t, v_t, alpha=alpha)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1

        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Evaluate greedy policy
            rew = utils.evaluate_policy(mdp, pi_g, render=render, initial_states=eval_states,
                                        n_episodes=eval_episodes, preprocess=preprocess)[0]
            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)

            # Make sure we restart from s
            mdp.reset(s)

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} L2 {} L_inf {} time {:.1f} s".format(
                    i, episodes[-1], rew, learning_rew, l_2_err, l_inf_err, elapsed_time))

            # if np.mean(episode_rewards[-mean_episodes - 1:-1]) > -80:
            #     render = True

    run_info = [iterations, episodes, n_samples, learning_rewards, evaluation_rewards, l_2, l_inf,
                episode_rewards[:len(episode_t)], episode_t]
    weights = np.array(Q._w)

    return [mdp.get_info(), weights, run_info]

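# ---------------------------------------------------------------------------
# Hedged sketch (illustrative only, not the repo's utils module): a standard
# Adam update matching the call pattern used above,
#     w, t, m_t, v_t = utils.adam(w, g, t, m_t, v_t, alpha=alpha).
# The actual utils.adam may differ in hyper-parameters and broadcasting; this
# is what a minimal NumPy implementation of that signature could look like.
def _adam_step_sketch(w, g, t, m_t, v_t, alpha=0.001, beta_1=0.9, beta_2=0.999, eps=1e-8):
    """One Adam step on parameters w given gradient g; returns the updated state."""
    t += 1
    m_t = beta_1 * m_t + (1 - beta_1) * g
    v_t = beta_2 * v_t + (1 - beta_2) * g ** 2
    m_hat = m_t / (1 - beta_1 ** t)  # bias-corrected first moment
    v_hat = v_t / (1 - beta_2 ** t)  # bias-corrected second moment
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)
    return w, t, m_t, v_t
# ---------------------------------------------------------------------------
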
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          cholesky_clip=0.0001,
          bandwidth=0.00001,
          post_components=1,
          max_iter_ukl=60,
          eps=0.001,
          eta=1e-6,
          time_coherent=False,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          ukl_tight_freq=1,
          sources=None,
          # Lambda function to calculate the weights
          weights_calculator=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    # Reset global variables
    global prior_eigen
    prior_eigen = None
    global cholesky_mask
    cholesky_mask = None
    global prior_normal
    prior_normal = None
    global posterior_normal
    posterior_normal = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size
    C = post_components

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    timesteps = len(weights)
    ws = []
    # Take only 1 sample per timestep
    for i in range(timesteps):
        samples = weights[i]
        np.random.shuffle(samples)
        ws.append(samples[0][1])  # 0: first sample (random), 1: weights
    ws = np.array(ws)

    # The gaussian mixture weights are uniform if not provided.
    c_bar = np.ones(timesteps) / timesteps if weights_calculator is None else weights_calculator(ws)

    # Take only gaussians with non-zero weights
    ws = ws[c_bar > 0]
    timesteps = len(ws)
    c_bar = c_bar[c_bar > 0]

    mu_bar = ws
    Sigma_bar = np.tile(np.eye(K) * bandwidth, (timesteps, 1, 1))
    Sigma_bar_inv = np.tile((1 / bandwidth * np.eye(K))[np.newaxis], (timesteps, 1, 1))

    # We initialize the parameters of the posterior to the best approximation of the posterior family to the prior
    c = np.ones(C) / C
    psi = c[:, np.newaxis] * c_bar[np.newaxis]
    phi = np.array(psi)

    mu = np.array([100 * np.random.randn(K) for _ in range(C)])
    Sigma = np.array([np.eye(K) for _ in range(C)])

    phi, psi = tight_ukl(c, mu, Sigma, c_bar, mu_bar, Sigma_bar, phi, psi, max_iter=max_iter_ukl, eps=eps)
    params, phi, psi = init_posterior(c, mu, Sigma, c_bar, mu_bar, Sigma_bar, phi, psi, C, K, cholesky_clip,
                                      max_iter_ukl, max_iter=max_iter_ukl * 10, precision=Sigma_bar_inv,
                                      eta=eta, eps=eps, verbose=verbose)

    # Add random episodes if needed
    init_samples = list()
    if random_episodes > 0:
        w, _ = sample_gmm(random_episodes, c_bar, mu_bar, np.sqrt(Sigma_bar))
        for i in range(random_episodes):
            Q._w = w[i]
            init_samples.append(utils.generate_episodes(mdp, pi_g, n_episodes=1, preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate((t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
                                       preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    l_2 = []
    l_inf = []
    fvals = []
    episode_t = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.zeros(C), np.ones((C, K)) * alpha_adam, np.zeros((C, K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(C), np.zeros((C, K)), np.ones((C, K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, C, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, C, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, c_bar, mu_bar, Sigma_bar, operator, i + 1,
                         phi, psi, n_weights, lambda_, max_iter_ukl, C, K, precision=Sigma_bar_inv, t_step=i,
                         ukl_tight_freq=ukl_tight_freq)
            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t, alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, C, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1

        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, C, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            c, mu, _ = unpack(params, C, K)
            rew = 0
            for j in range(C):
                Q._w = mu[j]
                rew += utils.evaluate_policy(mdp, pi_g, render=render, initial_states=eval_states,
                                             n_episodes=eval_episodes, preprocess=preprocess)[0]
            rew /= C

            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q, c_bar, mu_bar, Sigma_bar, operator,
                             i + 1, phi, psi, n_weights, lambda_, C, K, precision=Sigma_bar_inv)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s".format(
                    i, episodes[-1], rew, learning_rew, fval, l_2_err, l_inf_err, elapsed_time))

        if (i * 100 / max_iter) % 10 == 0:
            print("Seed: " + str(seed) + " - Progress: " + str(int(i * 100 / max_iter)) + "%")

    run_info = [iterations, episodes, n_samples, learning_rewards, evaluation_rewards, l_2, l_inf, fvals,
                episode_rewards[:len(episode_t)], episode_t]
    weights = np.array(mu)

    print("Task over: ", mdp.get_info(), " - Last learning rewards: ",
          np.around(run_info[3][-5:], decimals=3))

    return [mdp.get_info(), weights, run_info]

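# ---------------------------------------------------------------------------
# Hedged sketch (illustrative, under assumed shapes): the adam_mask / sgd_mask
# trick used above. All posterior parameters (mixture weights c, means mu,
# Cholesky factors L) live in one flat vector, and each optimizer receives a
# per-coordinate learning-rate vector that is zero outside the block it should
# update: Adam moves only the means, SGD moves only the Cholesky factors. The
# real pack/clip/utils.sgd in this codebase may use a different layout;
# _pack_sketch below is a stand-in, not the repo's pack.
def _pack_sketch(c, mu, L):
    """Flatten (c, mu, L) into a single parameter vector (assumed layout)."""
    return np.concatenate([np.ravel(c), np.ravel(mu), np.ravel(L)])

def _masked_sgd_step_sketch(params, grad, lr_mask):
    """Plain SGD step where lr_mask holds a per-coordinate learning rate."""
    return params - lr_mask * grad

# Usage with C components and K features (alpha_adam / alpha_sgd as above):
#   adam_mask = _pack_sketch(np.zeros(C), np.ones((C, K)) * alpha_adam, np.zeros((C, K, K)))
#   sgd_mask  = _pack_sketch(np.zeros(C), np.zeros((C, K)), np.ones((C, K, K)) * alpha_sgd)
# so each coordinate is updated only by the optimizer whose mask is non-zero there.
# ---------------------------------------------------------------------------
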
def learn(mdp,
          Q,
          operator,
          max_iter=5000,
          buffer_size=10000,
          batch_size=50,
          alpha_adam=0.001,
          alpha_sgd=0.1,
          lambda_=0.001,
          n_weights=10,
          train_freq=1,
          eval_freq=50,
          random_episodes=0,
          eval_states=None,
          eval_episodes=1,
          mean_episodes=50,
          preprocess=lambda x: x,
          sigma_reg=0.0001,
          cholesky_clip=0.0001,
          time_coherent=False,
          n_source=10,
          source_file=None,
          seed=None,
          render=False,
          verbose=True,
          sources=None):

    if seed is not None:
        np.random.seed(seed)

    # Randomly initialize the weights in case an MLP is used
    if isinstance(Q, MLPQFunction):
        Q.init_weights()

    global prior_eigen_torch
    prior_eigen_torch = None

    # Initialize policies
    pi_g = EpsilonGreedy(Q, np.arange(mdp.action_space.n), epsilon=0)

    # Get number of features
    K = Q._w.size

    # Load weights and construct prior distribution
    weights = utils.load_object(source_file) if sources is None else sources
    ws = np.array([w[1] for w in weights])
    np.random.shuffle(ws)
    # Take only the first n_source weights
    ws = ws[:n_source, :]
    mu_bar = np.mean(ws, axis=0)
    Sigma_bar = np.cov(ws.T)
    # We use higher regularization for the prior to prevent the ELBO from diverging
    Sigma_bar_inv = np.linalg.inv(Sigma_bar + np.eye(K) * sigma_reg)
    # We initialize the parameters at the prior with smaller regularization (just to make sure Sigma_bar is pd)
    params = clip(pack(mu_bar, np.linalg.cholesky(Sigma_bar + np.eye(K) * cholesky_clip ** 2)),
                  cholesky_clip, K)

    # Add random episodes if needed
    if random_episodes > 0:
        init_samples = list()
        for i in range(random_episodes):
            Q._w = sample_posterior(params, K)
            init_samples.append(utils.generate_episodes(mdp, pi_g, n_episodes=1, preprocess=preprocess))
        init_samples = np.concatenate(init_samples)

        t, s, a, r, s_prime, absorbing, sa = utils.split_data(init_samples, mdp.state_dim, mdp.action_dim)
        init_samples = np.concatenate((t[:, np.newaxis], preprocess(s), a, r[:, np.newaxis],
                                       preprocess(s_prime), absorbing[:, np.newaxis]), axis=1)

    # Figure out the effective state-dimension after preprocessing is applied
    eff_state_dim = preprocess(np.zeros(mdp.state_dim)).size

    # Create replay buffer
    buffer = Buffer(buffer_size, eff_state_dim)
    n_init_samples = buffer.add_all(init_samples) if random_episodes > 0 else 0

    # Results
    iterations = []
    episodes = []
    n_samples = []
    evaluation_rewards = []
    learning_rewards = []
    episode_rewards = [0.0]
    episode_t = []
    l_2 = []
    l_inf = []
    fvals = []

    # Create masks for ADAM and SGD
    adam_mask = pack(np.ones(K) * alpha_adam, np.zeros((K, K)))  # ADAM learns only \mu
    sgd_mask = pack(np.zeros(K), np.ones((K, K)) * alpha_sgd)  # SGD learns only L

    # Adam initial params
    m_t = 0
    v_t = 0
    t = 0

    # RMSprop for Variance
    v_t_var = 0.

    # Init env
    s = mdp.reset()
    h = 0
    Q._w = sample_posterior(params, K)

    start_time = time.time()

    # Learning
    for i in range(max_iter):

        # If we do not use time coherent exploration, resample parameters
        Q._w = sample_posterior(params, K) if not time_coherent else Q._w
        # Take greedy action wrt current Q-function
        s_prep = preprocess(s)
        a = np.argmax(Q.value_actions(s_prep))
        # Step
        s_prime, r, done, _ = mdp.step(a)
        # Build the new sample and add it to the dataset
        buffer.add_sample(h, s_prep, a, r, preprocess(s_prime), done)

        # Take a step of gradient if needed
        if i % train_freq == 0:
            # Estimate gradient
            g = gradient(buffer.sample_batch(batch_size), params, Q, mu_bar, Sigma_bar_inv, operator, i + 1,
                         lambda_, n_weights)
            # Take a gradient step for \mu
            params, t, m_t, v_t = utils.adam(params, g, t, m_t, v_t, alpha=adam_mask)
            # Take a gradient step for L
            params = utils.sgd(params, g, alpha=sgd_mask)
            # params, v_t_var = utils.rmsprop(params, g, v_t_var, alpha=sgd_mask)
            # Clip parameters
            params = clip(params, cholesky_clip, K)

        # Add reward to last episode
        episode_rewards[-1] += r * mdp.gamma ** h

        s = s_prime
        h += 1

        if done or h >= mdp.horizon:
            episode_rewards.append(0.0)
            s = mdp.reset()
            h = 0
            Q._w = sample_posterior(params, K)
            episode_t.append(i)

        # Evaluate model
        if i % eval_freq == 0:

            # Save current weights
            current_w = np.array(Q._w)

            # Evaluate MAP Q-function
            mu, _ = unpack(params, K)
            Q._w = mu
            rew = utils.evaluate_policy(mdp, pi_g, render=render, initial_states=eval_states,
                                        n_episodes=eval_episodes, preprocess=preprocess)[0]
            learning_rew = np.mean(episode_rewards[-mean_episodes - 1:-1]) if len(episode_rewards) > 1 else 0.0
            br = operator.bellman_residual(Q, buffer.sample_batch(batch_size)) ** 2
            l_2_err = np.average(br)
            l_inf_err = np.max(br)
            fval = objective(buffer.sample_batch(batch_size), params, Q, mu_bar, Sigma_bar_inv, operator,
                             i + 1, lambda_, n_weights)

            # Append results
            iterations.append(i)
            episodes.append(len(episode_rewards) - 1)
            n_samples.append(n_init_samples + i + 1)
            evaluation_rewards.append(rew)
            learning_rewards.append(learning_rew)
            l_2.append(l_2_err)
            l_inf.append(l_inf_err)
            fvals.append(fval)

            # Make sure we restart from s
            mdp.reset(s)

            # Restore weights
            Q._w = current_w

            end_time = time.time()
            elapsed_time = end_time - start_time
            start_time = end_time

            if verbose:
                print("Iter {} Episodes {} Rew(G) {} Rew(L) {} Fval {} L2 {} L_inf {} time {:.1f} s".format(
                    i, episodes[-1], rew, learning_rew, fval, l_2_err, l_inf_err, elapsed_time))

    run_info = [iterations, episodes, n_samples, learning_rewards, evaluation_rewards, l_2, l_inf, fvals,
                episode_rewards[:len(episode_t)], episode_t]
    weights = np.array(mu)

    return [mdp.get_info(), weights, run_info]

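# ---------------------------------------------------------------------------
# Hedged sketch (assumption about this codebase): sample_posterior(params, K)
# above is taken to draw Q-weights from the Gaussian posterior N(mu, L L^T)
# stored in params via the reparameterization w = mu + L @ eps, eps ~ N(0, I).
# The stand-in below assumes params packs mu (K entries) followed by the
# row-major K x K Cholesky factor L; the real pack/unpack may use a different
# layout, so treat this purely as an illustration of the sampling step.
def _sample_posterior_sketch(params, K):
    """Draw one weight vector from N(mu, L L^T), with params packed as [mu, vec(L)]."""
    mu = params[:K]
    L = params[K:K + K * K].reshape(K, K)
    eps = np.random.randn(K)
    return mu + L @ eps
# ---------------------------------------------------------------------------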