def test_random_choice_for_terminal_state(self):
    policy = RandomPolicy()
    policy.initialize_state(state='terminal', available_actions=set())
    suggestion = policy.suggest_action_for_state('terminal')
    self.assertIsNone(suggestion)
def test_random_recommendation_in_available_actions(self):
    pol = RandomPolicy()
    pol.initialize_state('s1', available_actions={'a1', 'a2', 'a3'})
    a0 = pol.suggest_action_for_state('s1')
    self.assertIn(a0, {'a1', 'a2', 'a3'})
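# The tests in this section assume a tabular RandomPolicy exposing
# is_new_state / initialize_state / suggest_action_for_state. The sketch
# below is inferred purely from those assertions and is NOT the project's
# actual implementation (the script snippets further below use differently
# parameterized RandomPolicy classes from other codebases).
import random


class RandomPolicy:
    def __init__(self):
        self._available_actions = {}

    def is_new_state(self, state):
        return state not in self._available_actions

    def initialize_state(self, state, available_actions):
        self._available_actions[state] = available_actions

    def suggest_action_for_state(self, state):
        actions = self._available_actions.get(state)
        if not actions:
            return None  # e.g. a terminal state initialized with no actions
        # random.choice needs a sequence, so sets are converted first.
        return random.choice(list(actions))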
def off_mc():
    """Off-policy Monte Carlo control with weighted importance sampling."""
    env = Env(6)
    policy = RandomPolicy(env.actions())  # behavior policy
    C = defaultdict(float)   # cumulative importance-sampling weights
    Q = defaultdict(float)   # action-value estimates
    Pi = {}                  # greedy target policy
    for _ in range(10000):
        G = 0
        W = 1.0
        states = get_episode(env, policy)
        # Walk the episode backwards, updating Q with weighted returns.
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            C[(s0, a)] += W
            Q[(s0, a)] += W / C[(s0, a)] * (G - Q[(s0, a)])
            Pi[s0] = max([(x, Q[(s0, x)]) for x in env.actions()],
                         key=lambda x: x[1])[0]
            # Once the behavior action diverges from the greedy target
            # policy, the importance weight for earlier steps is zero.
            if a != Pi[s0]:
                break
            W = W / policy.get_p(s0, a)
    for t in env.get_t():
        Pi[t] = 'ter'
    env.render(Pi)
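# off_mc above relies on a get_episode helper that is not shown. The sketch
# below only illustrates the shape it must have (a list of (s0, a, s1, r)
# transitions for one episode); the Env method names (reset, is_terminal,
# step) and policy.get_action are assumptions, not the source's confirmed API.
def get_episode(env, policy):
    transitions = []
    s0 = env.reset()
    while not env.is_terminal(s0):
        a = policy.get_action(s0)
        s1, r = env.step(s0, a)
        transitions.append((s0, a, s1, r))
        s0 = s1
    return transitions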
def init_pols(p, env):
    # Duplicate the policy, one copy per pool worker.
    global GLOBAL_POLS
    n = _pool().n_jobs
    #GLOBAL_POLS = [p.copy(env) for _ in range(n)]
    from policy import RandomPolicy
    GLOBAL_POLS = [RandomPolicy(env.action_space) for _ in range(n)]
def test_random_choice_called(self):
    policy = RandomPolicy()
    policy.initialize_state('s1', available_actions=['a1', 'a2', 'a3'])
    mocked_random_choice = MagicMock(return_value='a2')
    # If the state is not yet known, a random available action is returned.
    with patch('random.choice', mocked_random_choice):
        a0 = policy.suggest_action_for_state('s1')
    # Result is determined by the mocked "random" choice
    self.assertEqual('a2', a0)
    # Arguments of the mocked "random" choice should be available actions
    mocked_random_choice.assert_called_with(['a1', 'a2', 'a3'])
def test_warehouse_02():
    env = make_test_warehouse_env_01()
    expected_value = None
    policy = RandomPolicy()
    we.execute(env, policy)
    print('**' * 30)
    print('[Result]')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
def test_value_for(self):
    planner_policy = MagicMock()
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # Evaluation of a state-action pair should be the same as for the
    # qtable policy.
    policy.initialize_state('s1', {'a1', 'a2'})
    policy.update('s1', 'a2', -1.23)
    self.assertEqual(-1.23, policy.value_for('s1', 'a2'))
def get_dataset(n_trajectories=100, len_trajectories=1000, policy=None,
                list_of_traj=False, verbose=True, env=None):
    """Generate a dataset for FQI."""
    # The default used to be policy=RandomPolicy(), which Python evaluates
    # once at definition time and shares across calls; None avoids that.
    if policy is None:
        policy = RandomPolicy()
    X = []
    X_next = []
    Y = []
    start_time = time.time()
    for j in range(n_trajectories):
        if verbose and j % (math.ceil(n_trajectories / 10)) == 0:
            remaining_iterations = n_trajectories - j
            elapsed_time = time.time() - start_time
            remaining_time = 0
            if j > 0:
                remaining_time = elapsed_time / j * remaining_iterations
            print("Dataset generated at {}%, elapsed time {:.0f}s, "
                  "remaining time {:.0f}s".format(
                      int(j / n_trajectories * 100), elapsed_time,
                      remaining_time))
        # Bug fix: the passed-in policy was previously ignored in favor of
        # a fresh RandomPolicy() on every call. Also renamed `next` to
        # `x_next` to stop shadowing the builtin.
        traj, rewards, x_next = Agent.generate_trajectory(
            len_trajectories, policy=policy, stop_at_terminal=False, env=env)
        if list_of_traj:
            X.append(traj)
            X_next.append(x_next)
            Y.append(rewards)
        else:
            X.extend(traj)
            X_next.extend(x_next)
            Y.extend(rewards)
    return [np.array(X), np.array(Y), np.array(X_next)]
def test_update(self):
    planner_policy = MagicMock()
    qtable_policy = MagicMock(spec=QTablePolicy())
    random_policy = MagicMock(spec=RandomPolicy())
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # Updating the policy should update the qtable policy as well.
    policy.initialize_state('s1', {'a1', 'a2'})
    policy.update('s1', 'a2', -1.23)
    qtable_policy.update.assert_called_with('s1', 'a2', -1.23)
def test_new_state(self):
    policy = RandomPolicy()
    self.assertTrue(policy.is_new_state(state='s1'))
    policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
    self.assertFalse(policy.is_new_state('s1'))
def get_policy(self):
    if self.policy_name == "random":
        print("Policy: Random")
        policy = RandomPolicy(self.devices, self.app, self.emulator_path,
                              self.android_system, self.root_path,
                              self.pro_click, self.pro_longclick,
                              self.pro_scroll, self.pro_edit,
                              self.pro_naturalscreen, self.pro_leftscreen,
                              self.pro_back, self.pro_splitscreen,
                              self.pro_home)
    else:
        print("No valid input policy specified. Using policy \"none\".")
        policy = None
    return policy
def test_optimal_value_for(self):
    planner_policy = MagicMock()
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    # The optimal value of a state is the highest q-value over its actions.
    policy.initialize_state('s', {'a', 'b', 'c'})
    policy.update('s', 'a', 1.23)
    policy.update('s', 'b', -5.43)
    policy.update('s', 'c', 0.03)
    self.assertEqual(1.23, policy.optimal_value_for('s'))
def test_is_new_state(self):
    qtable_policy = QTablePolicy()
    random_policy = RandomPolicy()
    mdp_builder = VacuumCleanerWorldBuilder()
    mdp = mdp_builder.build_mdp()
    planner_policy = PlannerPolicy(planning_horizon=1,
                                   mdp_builder=mdp_builder)
    policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                           qtable_policy)
    self.assertTrue(policy.is_new_state(state='s1'))
    policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
    self.assertFalse(policy.is_new_state('s1'))
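# The PlanningExploringStartsPolicy tests in this section pin down a small
# delegation surface. This sketch is inferred purely from those assertions
# (value bookkeeping delegated to the Q-table policy); the real class
# presumably also uses the planner and random policies for action selection,
# which these tests do not cover.
class PlanningExploringStartsPolicy:
    def __init__(self, planner_policy, random_policy, qtable_policy):
        self.planner_policy = planner_policy
        self.random_policy = random_policy
        self.qtable_policy = qtable_policy

    def is_new_state(self, state):
        return self.qtable_policy.is_new_state(state)

    def initialize_state(self, state, available_actions):
        self.qtable_policy.initialize_state(state, available_actions)

    def update(self, state, action, value):
        self.qtable_policy.update(state, action, value)

    def value_for(self, state, action):
        return self.qtable_policy.value_for(state, action)

    def optimal_value_for(self, state):
        return self.qtable_policy.optimal_value_for(state)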
def test_warehouse_random(order_count, max_iteration):
    start = time.time()
    # Run repeatedly and keep the best finish time.
    best = 99999999999
    for i in range(max_iteration):
        env = make_test_warehouse_env(order_count)  # based on 60 orders
        policy = RandomPolicy()
        we.execute(env, policy)
        if best > env.finish_time_clock:
            best = env.finish_time_clock
        #print('Finish time clock value=', env.finish_time_clock,
        #      ':uncompleted orders=', len(env.available_orders))
    print('[Result] RandomPolicy')
    print("Random Best=", best)
    end = time.time()
    print('time', (end - start))
def generate_trajectory(iterations, policy=None, stop_at_terminal=True,
                        env=None):
    """Generate a trajectory following the policy of the agent."""
    # The default used to be policy=RandomPolicy(), which is created once at
    # definition time and shared across calls; None avoids that.
    if policy is None:
        policy = RandomPolicy()
    init_state = env.reset()  # should return a state vector if everything worked
    trajectory = []
    rewards = []
    x_next = []
    agent = Agent(env, init_state, policy)
    curr_state = init_state
    for _ in range(iterations):
        action = policy.get_action(agent)
        obs, reward, done, _ = env.step([action])
        # Current state, stored as [action, *state]
        t = [action]
        t.extend(curr_state)
        trajectory.append(t)
        # Reward, with an extra penalty on termination
        if done:
            reward = reward - 10
        rewards.append(reward)
        # Next state
        x_next.append(obs)
        if stop_at_terminal and done:
            break
        # Bug fix: advance the current state; previously every row paired
        # the action with the initial state.
        curr_state = obs
    return [trajectory, rewards, x_next]
from game import GameConfig
from policy import RandomPolicy
from data.sql import store_record as sql_store
from data.sql import close as sql_close
from play import play_games

###########################
### Configuration to change
###########################
STORE_IN_SQL = True

###########################
### Initialize the env
###########################
config = GameConfig()
print(config)

writer = sql_store if STORE_IN_SQL else None
players = lambda b: [RandomPolicy(b, 'b'), RandomPolicy(b, 'w')]
play_games(config, players=players, writer=writer)
sql_close()
def main():
    num_episodes = 5000
    steps_per_episode = 200
    epochs = 100
    batch_size = 100
    render = False

    tensorboard_callback_reward = keras.callbacks.TensorBoard(
        log_dir="logs/scalars/rewards" +
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback_transitions = keras.callbacks.TensorBoard(
        log_dir="logs/scalars/transitions" +
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    env = gym.make('Trajectory-v0')
    episodes = collect_data(env, num_episodes, steps_per_episode,
                            RandomPolicy(env), render)
    # transitions, rewards = prepare_datasets(episodes)
    transitions_lstm, rewards_lstm = prepare_datasets_lstm(episodes)
    validation_episodes = collect_data(env, num_episodes, steps_per_episode,
                                       RandomPolicy(env), render)
    validation_transitions_lstm, validation_rewards_lstm = \
        prepare_datasets_lstm(validation_episodes)

    # transition_net = keras.Sequential([
    #     keras.layers.Input(shape=(3+env.num_observables, env.num_dimensions)),
    #     keras.layers.Flatten(),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense((2+env.num_observables)*env.num_dimensions),
    #     keras.layers.Reshape((2+env.num_observables, env.num_dimensions))
    # ])
    # transition_net.compile(optimizer='adam', loss='mse')

    # reward_net = keras.Sequential([
    #     keras.layers.Input(shape=(3+env.num_observables, env.num_dimensions)),
    #     keras.layers.Flatten(),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(1)
    # ])
    # reward_net.compile(optimizer='adam', loss='mse')

    transition_net_lstm = keras.Sequential([
        keras.layers.Input(shape=(steps_per_episode,
                                  3 + env.num_observables,
                                  env.num_dimensions)),
        keras.layers.Reshape((steps_per_episode,
                              (3 + env.num_observables) * env.num_dimensions)),
        keras.layers.LSTM(32, return_sequences=False),
        keras.layers.Dense((2 + env.num_observables) * env.num_dimensions),
        keras.layers.Reshape((2 + env.num_observables, env.num_dimensions))
    ])
    transition_net_lstm.compile(optimizer='adam', loss='mse')

    reward_net_lstm = keras.Sequential([
        keras.layers.Input(shape=(steps_per_episode,
                                  3 + env.num_observables,
                                  env.num_dimensions)),
        keras.layers.Reshape((steps_per_episode,
                              (3 + env.num_observables) * env.num_dimensions)),
        keras.layers.LSTM(32, return_sequences=False),
        keras.layers.Dense(1)
    ])
    reward_net_lstm.compile(optimizer='adam', loss='mse')

    transition_net_lstm.fit(
        transitions_lstm[0], transitions_lstm[1][:, -1],
        epochs=epochs, batch_size=batch_size,
        validation_data=(validation_transitions_lstm[0],
                         validation_transitions_lstm[1][:, -1]),
        callbacks=[tensorboard_callback_transitions])
    reward_net_lstm.fit(
        rewards_lstm[0], rewards_lstm[1][:, -1],
        epochs=epochs, batch_size=batch_size,
        validation_data=(validation_rewards_lstm[0],
                         validation_rewards_lstm[1][:, -1]),
        callbacks=[tensorboard_callback_reward])
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                             os.pardir))

import argparse
import numpy as np
#from gym import wrappers
import gym

from memory import ReplayMemory
from policy import EpsilonPolicy, RandomPolicy
from agent import DQNAgent
from processor import VoidProcessor
from model import FullyConnectedModel

parser = argparse.ArgumentParser()
args = parser.parse_args()

env = gym.make('CartPole-v1')
num_actions = env.action_space.n

model = FullyConnectedModel(num_actions=num_actions,
                            neurons_per_layer=5,
                            num_layers=2,
                            learning_rate=0.002,
                            load_weights_file=None)
memory = ReplayMemory(maxlen=10000, game_over_bias=10)
processor = VoidProcessor()
policy = RandomPolicy()
dqn = DQNAgent(env=env,
               memory=memory,
               policy=policy,
               model=model,
               discount_rate=0.99,
               processor=processor)
dqn.play(delay=0.2)
def temp_replace_policy(self):
    if self.run_type is RunType.RAND_FILL:
        self.agent.currently_used_policy = RandomPolicy()
    if self.run_type is RunType.TEST:
        self.agent.currently_used_policy = GreedyPolicy()
def FQI(possible_actions, iterations, verbose=True, gamma=0.99, env=None):
    """FQI algorithm. Generates a learning set (X, Y, X_next) and iterates
    fitted Q-value regression with an extra-trees model."""
    model = ExtraTreesRegressor(50)
    start_time = time.time()
    alive_times = []
    rewards_means = []
    policy = RandomPolicy()
    X, Y, X_next = dataset_util.get_dataset(100, 200, list_of_traj=False,
                                            env=env, policy=policy,
                                            verbose=False)
    for j in range(iterations):
        Y_0 = Y
        model.fit(X, Y)
        # Update Y with the one-step bootstrapped targets.
        Y = []
        for i, x_next in enumerate(X_next):
            to_predict = np.array(
                list(map(lambda u: np.concatenate(([u], x_next)),
                         possible_actions)))
            max_prediction = max(model.predict(to_predict))
            Y.append(Y_0[i] + gamma * max_prediction)
        policy = OptimalPolicyDiscrete(possible_actions, model)
        # Testing: roll out the greedy policy for 128 episodes.
        alive = []
        rews = []
        for k in range(128):
            init_state = env.reset()
            agent = Agent(env, init_state)
            agent.policy = policy
            done = False
            reward = 0
            decay = 1
            steps = 0
            while not done:
                _, r, done = agent.step()
                reward += r * decay
                decay *= 0.99
                steps += 1
            alive.append(steps)
            rews.append(reward)
        alive_times.append(np.mean(alive))
        rewards_means.append(np.mean(rews))
        # Printing for verbose mode. The original rebound the loop variable
        # (j = j + 1); counting completed fits explicitly is clearer.
        if verbose:
            fits_done = j + 1
            remaining_iterations = iterations - fits_done
            elapsed_time = time.time() - start_time
            remaining_time = elapsed_time / fits_done * remaining_iterations
            print("Fit {}, elapsed time {:.0f}s, remaining time {:.0f}s, "
                  "alive steps = {:.1f}, reward = {:.1f}".format(
                      fits_done, elapsed_time, remaining_time,
                      alive_times[-1], rewards_means[-1]))
    return [model, alive_times, rewards_means]
    return (pred_gp_mean, pred_gp_variance, rollout_gp,
            pred_gp_mean_trajs, pred_gp_variance_trajs, rollout_gp_trajs)


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    plt.style.use('ggplot')

    from cartpole_sim import CartpoleSim
    from policy import SwingUpAndBalancePolicy, RandomPolicy
    from visualization import Visualizer
    import cv2

    vis = Visualizer(cartpole_length=1.5,
                     x_lim=(0.0, DELTA_T * NUM_DATAPOINTS_PER_EPOCH))
    swingup_policy = SwingUpAndBalancePolicy('policy.npz')
    random_policy = RandomPolicy(seed=12831)
    sim = CartpoleSim(dt=DELTA_T)

    # Initial training data used to train GP for the first epoch
    init_state = np.array([0.01, 0.01, np.pi * 0.5, 0.1]) * rng.randn(4)
    ts, state_traj, action_traj = sim_rollout(sim, random_policy,
                                              NUM_DATAPOINTS_PER_EPOCH,
                                              DELTA_T, init_state)
    delta_state_traj = state_traj[1:] - state_traj[:-1]
    train_x, train_y = make_training_data(state_traj[:-1], action_traj,
                                          delta_state_traj)

    for epoch in range(NUM_TRAINING_EPOCHS):
        vis.clear()
        type=bool,
        default=True,
        help="True for density sensor, false for minimum")
    return parser.parse_args()


def joint_action(policies, joint_input):
    return [f(x).data.numpy() for f, x in zip(policies, joint_input)]


if __name__ == "__main__":
    arglist = parse_args()
    domain = Task_Rovers(arglist)
    obs = domain.reset()

    policy = RandomPolicy(output_shape=2, low=-1, high=1)
    policies = [policy.get_next() for _ in range(arglist.num_rover)]
    networks = [Evo_MLP(12, 2) for _ in range(arglist.num_rover)]
    policies = [net.get_next() for net in networks]
    updates = [net.get_evo() for net in networks]

    obs = domain.reset()
    for _ in range(arglist.num_timestep):
        print("Step")
        action = joint_action(policies, obs)
        print(action)
        for f in updates:
            f()
        action = joint_action(policies, obs)
        print(action)
import data_utils
from mdp import MDP
from rewards import reward_func_linear  # call it with stats to initialize
from env import Env
from q_learning import QLearningAlgo
from policy import EpsilonGreedyPolicy, GreedyPolicy, RandomPolicy

data = data_utils.Data(n=15)
mdp = MDP(data=data)
reward_func = reward_func_linear(data.statistics, verbose=False)
env = Env(reward_func=reward_func, mode='human')

# policy = EpsilonGreedyPolicy(action_space=mdp.action_space)
policy = RandomPolicy(action_space=mdp.action_space)
test_policy = GreedyPolicy(action_space=mdp.action_space)

algo = QLearningAlgo(env=env, mdp=mdp, policy=policy, discount=0.2)
algo.set_mode('train')
algo.fit(mode='train', epochs=4, remember=True)
algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

algo.replay(batch_size=16, epochs=8)
algo.set_mode('test')
algo.test(mode='test', policy=test_policy)
# algo.test(mode='human', policy=test_policy)

import numpy as np
import matplotlib.pyplot as plt
    goal = tuple(np.random.randint(0, 10, [2]))
    return goal


def plot(rewards_trails, color):
    for count in range(10):
        plt.plot(np.arange(10000), rewards_trails[count],
                 linestyle='dotted')
    line, = plt.plot(np.arange(10000), np.average(rewards_trails, axis=0),
                     linestyle='solid', color=color)
    return line


manual_agent()
random_policy_rewards = non_learning_agent(RandomPolicy())
worse_policy_rewards = non_learning_agent(WorsePolicy())
better_policy_rewards = non_learning_agent(BetterThanRandomPolicy())

line1 = plot(random_policy_rewards, 'red')
line2 = plot(worse_policy_rewards, 'green')
line3 = plot(better_policy_rewards, 'blue')
plt.ylabel("Cumulative reward")
plt.xlabel("Steps")
plt.legend([line1, line2, line3], ['random', 'worse', 'better'])
plt.savefig('qt4.png')
plt.show()

random_goal = get_random_goal()
print(f"New goal is {random_goal}")
random_policy_rewards = non_learning_agent(RandomPolicy(), random_goal)
worse_policy_rewards = non_learning_agent(WorsePolicy(), random_goal)