# Shared third-party imports for the script excerpts in this section; utils,
# MLPQFunction, day_pass, etc. are repo-local (exact import paths omitted here).
import numpy as np
import torch
from joblib import Parallel, delayed


def year_pass(k, v):
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = 0
    task.reset()
    num_days = task.n_days

    # Roll out every day of the year, sequentially or with joblib workers.
    if n_jobs == 1:
        outputs = [day_pass(k, v, d) for d in range(num_days)]
    elif n_jobs > 1:
        outputs = Parallel(n_jobs=n_jobs, max_nbytes=None)(
            delayed(day_pass)(k, v, d) for d in range(num_days))

    days = []
    actions = np.zeros((num_days, len(task.prices[0])))
    rewards = np.zeros((num_days, len(task.prices[0])))
    state_value_list = []
    for (d, r, a, svl) in outputs:
        days.append(d)
        rewards[d, :] = r
        actions[d, :] = a
        state_value_list.extend(svl)

    print("Days:", len(days))
    print("Rewards sum:", np.sum(rewards))
    print("State values list length:", len(state_value_list))
    utils.save_object(state_value_list, save_dataset_path + k)
    utils.save_object([days, actions, rewards], save_actions_path + k)
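# day_pass itself is not part of this excerpt. Below is a minimal sketch of the
# contract year_pass relies on, inferred from the unpacking loop above and from
# the rollout loops later in this section: it returns the day index, a
# per-timestep reward row, a per-timestep action row, and a list of visited
# (state, value) pairs. Everything here is an illustrative assumption, not the
# repo's actual implementation.
def day_pass_sketch(k, v, d):
    task = v["task"]
    Q = utils.load_object(etr_path + v["policy"])
    task.starting_day_index = d
    s = [task.reset()]
    T = len(task.prices[0])
    a_row, r_row, svl = np.zeros(T), np.zeros(T), []
    done = False
    while not done:
        q = Q._q_values(s)
        a = np.argmax(q)            # greedy action under the loaded policy
        svl.append((s, np.max(q)))  # state together with its value estimate
        s, r, done, _ = task.step(a)
        s = [s]
        a_row[task.current_timestep] = a - 1  # action index [0, 2] -> position [-1, 1]
        r_row[task.current_timestep] = r
    return [d, r_row, a_row, svl]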
# The excerpt begins mid-call: these are the trailing keyword arguments of the
# experiment launcher (its head is not part of this excerpt).
            lambda_=lambda_, n_weights=n_weights, train_freq=train_freq,
            eval_freq=eval_freq, random_episodes=random_episodes,
            eval_states=eval_states, mean_episodes=mean_episodes,
            preprocess=rbf, sigma_reg=sigma_reg, cholesky_clip=cholesky_clip,
            time_coherent=time_coherent, n_source=n_source,
            source_file=source_file, seed=seed, render=render, verbose=verbose)

# Fixed seeds for reproducibility; one seed per run.
seeds = [9, 44, 404, 240, 259, 141, 371, 794, 41, 507,
         819, 959, 829, 558, 638, 127, 672, 4, 635, 687]
seeds = seeds[:n_runs]

if n_jobs == 1:
    results = [run(mdp, seed) for (mdp, seed) in zip(mdps, seeds)]
elif n_jobs > 1:
    results = Parallel(n_jobs=n_jobs)(delayed(run)(mdp, seed)
                                      for (mdp, seed) in zip(mdps, seeds))

utils.save_object(results, file_name)
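# For readers unfamiliar with joblib: Parallel/delayed evaluates the same calls
# as the list comprehension above, just spread over n_jobs worker processes,
# and returns the results in input order. A tiny self-contained illustration
# (square is a made-up example function):
from joblib import Parallel, delayed

def square(x):
    return x * x

assert Parallel(n_jobs=2)(delayed(square)(x) for x in range(5)) == [0, 1, 4, 9, 16]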
# Same launch as above, but n_source is swept: this excerpt sits inside an
# enclosing loop over i (the number of source samples/tasks; the loop header is
# not part of the excerpt), and each setting's results are accumulated in scores.
            lambda_=lambda_, n_weights=n_weights, train_freq=train_freq,
            eval_freq=eval_freq, random_episodes=random_episodes,
            eval_states=eval_states, mean_episodes=mean_episodes,
            preprocess=rbf, sigma_reg=sigma_reg, cholesky_clip=cholesky_clip,
            time_coherent=time_coherent, n_source=i,
            source_file=source_file, seed=seed, render=render, verbose=verbose)

seeds = [9, 44, 404, 240, 259, 141, 371, 794, 41, 507,
         819, 959, 829, 558, 638, 127, 672, 4, 635, 687]
seeds = seeds[:n_runs]

if n_jobs == 1:
    results = [run(mdp, seed) for (mdp, seed) in zip(mdps, seeds)]
elif n_jobs > 1:
    results = Parallel(n_jobs=n_jobs)(delayed(run)(mdp, seed)
                                      for (mdp, seed) in zip(mdps, seeds))

scores.append([i, results])
utils.save_object(scores, file_name)
def transfer(dataset_path, mdp, save_path, iterations, year, seed=0):
    np.random.seed(seed)
    data = utils.load_object(dataset_path)
    data = np.array(data)
    state_dim = mdp.state_dim
    n_actions = mdp.action_space.n
    mdp.starting_day_index = 0
    mdp.reset()
    day_length = len(mdp.prices[0])

    Q = MLPQFunction(state_dim, n_actions, layers=layers)
    Q.init_weights()

    # Adam optimizer state
    m_t = 0
    v_t = 0
    t = 0

    utils.save_object([], save_path)
    losses = [[], [], []]  # one loss history per action

    for i in range(iterations):
        # Sample a time of day, then take the transition at that time from
        # every day, shuffle, and keep one batch.
        time = int(np.random.uniform(low=0, high=day_length))
        datapoints = np.arange(0, len(data) - day_length, day_length)
        datapoints += time
        datapoints = data[datapoints]
        np.random.shuffle(datapoints)
        datapoints = datapoints[:batch_size]

        for a in range(n_actions):
            with torch.autograd.set_detect_anomaly(True):
                train_loss, grad = compute_gradient_single_action(
                    Q, datapoints, a)
            losses[a].append(train_loss)
            print("Y: {0}, I: {1:5d}, Time: {2:4d}, A: {3:1d}, "
                  "Grad: {4:8.6f}, Train Loss: {5:8.6f}".format(
                      year, i, time, a, np.linalg.norm(grad), train_loss))
            Q._w, t, m_t, v_t = utils.adam(Q._w, grad, t, m_t, v_t, alpha=alpha)

        if save_freq > 0 and i % save_freq == 0:
            past_Qs = utils.load_object(save_path)
            past_Qs.append(np.array(Q._w))
            utils.save_object(past_Qs, save_path)
            plot_actions(dataset_path, Q._w, i, mdp, n_actions_plot,
                         path + "/plot-" + year + "-" + str(i))

    print("Model selected index: {0:4d}, Train Loss: "
          "[{1:8.6f}, {2:8.6f}, {3:8.6f}]".format(
              i, losses[0][i], losses[1][i], losses[2][i]))

    return [mdp.get_info(), np.array(Q._w), losses]
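# utils.adam is not shown in this excerpt. For reference, a minimal sketch of a
# standard Adam update matching the call site above ((w, grad, t, m_t, v_t) in,
# (w, t, m_t, v_t) out). This is an assumption about its internals with the
# usual Adam hyperparameter defaults, not the repo's actual code.
def adam_sketch(w, grad, t, m_t, v_t, alpha=0.001,
                beta_1=0.9, beta_2=0.999, eps=1e-8):
    t += 1
    m_t = beta_1 * m_t + (1. - beta_1) * grad        # first-moment (mean) estimate
    v_t = beta_2 * v_t + (1. - beta_2) * grad ** 2   # second-moment estimate
    m_hat = m_t / (1. - beta_1 ** t)                 # bias corrections
    v_hat = v_t / (1. - beta_2 ** t)
    w = w - alpha * m_hat / (np.sqrt(v_hat) + eps)   # descent step
    return w, t, m_t, v_t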
# Run the transfer pre-training on every source task and store the results.
results = []
for k, v in dataset.items():
    print(k)
    results.append(
        [transfer(v['data'], v['mdp'], v['save_path'], iterations, k)])
utils.save_object(results, sources_file_name)
utils.save_object(tasks, tasks_file_name)
        # excerpt resumes inside the per-day rollout loop of the evaluation function
        actions[di, task.current_timestep] = a - 1  # action index [0, 2] -> position [-1, 1]
        rewards[di, task.current_timestep] = r
    print("{0:s} - Day: {1:4d}, Cumulative reward: {2:8.6f}".format(
        k, di, np.sum(rewards)))
    return [days, actions, rewards, state_value_list]


def make_Q(weights, task):
    # task parameters
    state_dim = task.state_dim
    n_actions = task.action_space.n
    return MLPQFunction(state_dim, n_actions, layers=layers,
                        initial_params=weights)


# Evaluate each stored weight vector over a full year of its task.
for k, v in w_dict.items():
    print(k)
    Q = make_Q(v["weights"], v["task"])
    v["task"].starting_day_index = 0
    v["task"].reset()
    output = year_pass(Q, v["task"])
    print(len(output))
    utils.save_object(output, save_actions_path + k)
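# A small illustrative sketch of reading one saved output back, assuming the
# [days, actions, rewards, state_value_list] layout returned above; the key
# "some_year" and the cumulative-reward plot are hypothetical, not part of the
# original scripts.
import matplotlib.pyplot as plt

days, actions, rewards, svl = utils.load_object(save_actions_path + "some_year")
plt.plot(np.cumsum(rewards.sum(axis=1)))  # cumulative reward across the year
plt.xlabel("day")
plt.ylabel("cumulative reward")
plt.show()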
    # excerpt resumes at the top of the per-day rollout (one pass per day index di)
    s = task.reset()
    s = [s]
    print("Day index:", di)
    days.append(task.selected_day)
    done = False
    while not done:
        a = np.argmax(Q._q_values(s))  # greedy action from the loaded policy
        s, r, done, _ = task.step(a)
        s = [s]
        actions[di, task.current_timestep] = a - 1  # action index [0, 2] -> position [-1, 1]
        rewards[di, task.current_timestep] = r
    print("Cumulative reward:", np.sum(rewards))
    return [days, actions, rewards]


# Replay each saved policy over its task's full year and store the traces.
for k, v in etrs.items():
    print(k)
    Q = utils.load_object(etr_path + v["policy"])
    task = v["task"]
    task.starting_day_index = 0
    task.reset()
    output = year_pass(Q, task)
    utils.save_object(output, "visualize-actions/" + k)