Example #1
    def test_random_choice_for_terminal_state(self):

        policy = RandomPolicy()
        policy.initialize_state(state='terminal', available_actions=set())
        suggestion = policy.suggest_action_for_state('terminal')

        self.assertIsNone(suggestion)
Example #2
    def test_random_recommendation_in_available_actions(self):

        pol = RandomPolicy()
        pol.initialize_state('s1', available_actions={'a1', 'a2', 'a3'})

        a0 = pol.suggest_action_for_state('s1')
        self.assertIn(a0, {'a1', 'a2', 'a3'})
Example #3
def off_mc():
    """Off-policy Monte Carlo control with weighted importance sampling."""
    env = Env(6)
    policy = RandomPolicy(env.actions())  # behavior policy
    C = defaultdict(float)  # cumulative importance-sampling weights per (s, a)
    Q = defaultdict(float)  # action-value estimates
    Pi = {}                 # greedy target policy
    for _ in range(10000):
        G = 0    # return accumulated backwards through the episode
        W = 1.0  # importance-sampling weight
        states = get_episode(env, policy)
        for (s0, a, s1, r) in reversed(states):
            G = 0.9 * G + r
            C[(s0, a)] += W
            Q[(s0, a)] += W / C[(s0, a)] * (G - Q[(s0, a)])
            Pi[s0] = max([(x, Q[(s0, x)]) for x in env.actions()],
                         key=lambda x: x[1])[0]
            # Stop once the behavior action diverges from the greedy target policy.
            if a != Pi[s0]:
                break
            W = W / policy.get_p(s0, a)

    for t in env.get_t():
        Pi[t] = 'ter'  # mark terminal states
    env.render(Pi)
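In the loop above, the importance-sampling weight is divided by policy.get_p(s0, a), the probability with which the behavior policy chose the action. For a uniform random behavior policy over a discrete action set that probability is simply 1/|A|. As a point of reference, a minimal sketch of such a policy (hypothetical class and attribute names, not the snippet's actual RandomPolicy) could look like this:

import random


class UniformRandomBehaviorPolicy:
    """Hypothetical uniform behavior policy matching the weight update above."""

    def __init__(self, actions):
        self.actions = list(actions)

    def get_action(self, state):
        # The state is ignored: every action is chosen uniformly at random.
        return random.choice(self.actions)

    def get_p(self, state, action):
        # Probability of picking any particular action under the uniform policy.
        return 1.0 / len(self.actions)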
Example #4
def init_pols(p, env):
    # Duplicate the policy
    global GLOBAL_POLS
    n = _pool().n_jobs
    #GLOBAL_POLS = [p.copy(env) for _ in range(n)] 
    from policy import RandomPolicy
    GLOBAL_POLS = [RandomPolicy(env.action_space) for _ in range(n)]
Example #5
    def test_random_choice_called(self):

        policy = RandomPolicy()
        policy.initialize_state('s1', available_actions=['a1', 'a2', 'a3'])

        mocked_random_choice = MagicMock(return_value='a2')

        # If the state is not yet known, a random available action is returned.
        with patch('random.choice', mocked_random_choice):

            a0 = policy.suggest_action_for_state('s1')

            # Result is determined by the mocked "random" choice
            self.assertEqual('a2', a0)

            # Arguments of the mocked "random" choice should be available actions
            mocked_random_choice.assert_called_with(['a1', 'a2', 'a3'])
Example #6
def test_warehouse_02():
    env = make_test_warehouse_env_01()
    expected_value = None
    policy = RandomPolicy()
    we.execute(env, policy)
    print('**' * 30)

    print('[Result]')
    print('Finish time clock value=', env.finish_time_clock,
          ':uncompleted orders=', len(env.available_orders))
Example #7
    def test_value_for(self):
        planner_policy = MagicMock()
        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # Evaluation of a state-action pair should be the same as for the qtable policy.
        policy.initialize_state('s1', {'a1', 'a2'})
        policy.update('s1', 'a2', -1.23)
        self.assertEqual(-1.23, policy.value_for('s1', 'a2'))
Example #8
def get_dataset(n_trajectories=100,
                len_trajectories=1000,
                policy=RandomPolicy(),
                list_of_traj=False,
                verbose=True,
                env=None):
    """ Generate a dataset for FQI."""

    X = []
    X_next = []
    Y = []

    start_time = time.time()

    for j in range(n_trajectories):
        if verbose and j % (math.ceil(n_trajectories / 10)) == 0:
            remaining_iterations = n_trajectories - j
            elapsed_time = time.time() - start_time
            remaining_time = 0
            if j > 0:
                remaining_time = elapsed_time / j * remaining_iterations
            print(
                "Dataset generated at {}%, elapsed time {:.0f}s, remaining time {:.0f}s"
                .format(int(j / n_trajectories * 100), elapsed_time,
                        remaining_time))

        # Roll out one trajectory with the supplied policy.
        traj, rewards, x_next = Agent.generate_trajectory(
            len_trajectories,
            policy=policy,
            stop_at_terminal=False,
            env=env)

        if list_of_traj:
            X.append(traj)
            X_next.append(x_next)
            Y.append(rewards)
        else:
            X.extend(traj)
            X_next.extend(x_next)
            Y.extend(rewards)

    return [np.array(X), np.array(Y), np.array(X_next)]
Example #9
    def test_update(self):

        planner_policy = MagicMock()
        qtable_policy = MagicMock(spec=QTablePolicy())
        random_policy = MagicMock(spec=RandomPolicy())
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # Updating the policy should update the qtable policy as well.
        policy.initialize_state('s1', {'a1', 'a2'})
        policy.update('s1', 'a2', -1.23)
        qtable_policy.update.assert_called_with('s1', 'a2', -1.23)
Example #10
    def test_new_state(self):

        policy = RandomPolicy()

        self.assertTrue(policy.is_new_state(state='s1'))

        policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
        self.assertFalse(policy.is_new_state('s1'))
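Taken together, the tests in Examples #1, #2, #5, and #10 pin down a small interface for RandomPolicy: it tracks which states it has seen, remembers the available actions per state, and suggests a uniformly random action (or None when no action is available). A minimal sketch consistent with that behavior, assuming nothing beyond what the tests assert, might be:

import random


class RandomPolicySketch:
    """Minimal sketch of the interface exercised by the tests above."""

    def __init__(self):
        self._available_actions = {}  # state -> list of available actions

    def is_new_state(self, state):
        return state not in self._available_actions

    def initialize_state(self, state, available_actions):
        self._available_actions[state] = list(available_actions)

    def suggest_action_for_state(self, state):
        actions = self._available_actions.get(state, [])
        if not actions:
            return None  # terminal state: nothing left to suggest
        return random.choice(actions)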
Example #11
 def get_policy(self):
     if self.policy_name == "random":
         print("Policy: Random")
         policy = RandomPolicy(self.devices, self.app, self.emulator_path,
                               self.android_system, self.root_path,
                               self.pro_click, self.pro_longclick,
                               self.pro_scroll, self.pro_edit,
                               self.pro_naturalscreen, self.pro_leftscreen,
                               self.pro_back, self.pro_splitscreen,
                               self.pro_home)
     else:
         print("No valid input policy specified. Using policy \"none\".")
         policy = None
     return policy
Example #12
    def test_optimal_value_for(self):

        planner_policy = MagicMock()
        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()
        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        # The optimal value of a state should be the maximum of the qtable policy's action values.
        policy.initialize_state('s', {'a', 'b', 'c'})
        policy.update('s', 'a', 1.23)
        policy.update('s', 'b', -5.43)
        policy.update('s', 'c', 0.03)
        self.assertEqual(1.23, policy.optimal_value_for('s'))
Example #13
    def test_is_new_state(self):

        qtable_policy = QTablePolicy()
        random_policy = RandomPolicy()

        mdp_builder = VacuumCleanerWorldBuilder()
        mdp = mdp_builder.build_mdp()
        planner_policy = PlannerPolicy(planning_horizon=1,
                                       mdp_builder=mdp_builder)

        policy = PlanningExploringStartsPolicy(planner_policy, random_policy,
                                               qtable_policy)

        self.assertTrue(policy.is_new_state(state='s1'))
        policy.initialize_state(state='s1', available_actions={'a(1)', 'a(2)'})
        self.assertFalse(policy.is_new_state('s1'))
Example #14
def test_warehouse_random(order_count, max_iteration):
    start = time.time()

    # Repeat the rollout and keep the best finish time.
    best = float('inf')
    for i in range(max_iteration):
        env = make_test_warehouse_env(order_count)  # based on 60 orders
        policy = RandomPolicy()
        we.execute(env, policy)
        if best > env.finish_time_clock:
            best = env.finish_time_clock
        #print('Finish time clock value=', env.finish_time_clock,':uncompleted orders=',len(env.available_orders))

    print('[Result] RandomPolicy')
    print("Random Best=", best)
    end = time.time()
    print('time', (end - start))
Example #15
    def generate_trajectory(iterations,
                            policy=RandomPolicy(),
                            stop_at_terminal=True,
                            env=None):
        """Generate a trajectory following the policy of the agent"""

        init_state = env.reset()  # should return a state vector if everything worked

        trajectory = []
        rewards = []
        x_next = []

        agent = Agent(env, init_state, policy)

        curr_state = init_state

        for _ in range(iterations):
            action = policy.get_action(agent)
            obs, reward, done, _ = env.step([action])

            # Current state
            t = [action]
            t.extend(curr_state)
            trajectory.append(t)

            # Reward
            if done:
                reward = reward - 10
            rewards.append(reward)

            # Next state
            x_next.append(obs)

            if stop_at_terminal and done:
                break

        return [trajectory, rewards, x_next]
Example #16
from game import GameConfig
from policy import RandomPolicy
from data.sql import store_record as sql_store
from data.sql import close as sql_close
from play import play_games

###########################
### Configuration to change
###########################

STORE_IN_SQL = True

###########################
### Initialize the env
###########################

config = GameConfig()
print(config)

writer = sql_store if STORE_IN_SQL else None

players = lambda b: [RandomPolicy(b, 'b'), RandomPolicy(b, 'w')]

play_games(config, players=players, writer=writer)

sql_close()

Example #17
def main():

    num_episodes = 5000
    steps_per_episode = 200

    epochs = 100
    batch_size = 100
    render = False

    tensorboard_callback_reward = keras.callbacks.TensorBoard(
        log_dir="logs/scalars/rewards" +
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
    tensorboard_callback_transitions = keras.callbacks.TensorBoard(
        log_dir="logs/scalars/transitions" +
        datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))

    env = gym.make('Trajectory-v0')
    episodes = collect_data(env, num_episodes, steps_per_episode,
                            RandomPolicy(env), render)
    # transitions, rewards = prepare_datasets(episodes)
    transitions_lstm, rewards_lstm = prepare_datasets_lstm(episodes)

    validation_episodes = collect_data(env, num_episodes, steps_per_episode,
                                       RandomPolicy(env), render)
    validation_transitions_lstm, validation_rewards_lstm = prepare_datasets_lstm(
        validation_episodes)

    # transition_net = keras.Sequential([
    #     keras.layers.Input(shape=(3+env.num_observables, env.num_dimensions)),
    #     keras.layers.Flatten(),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense((2+env.num_observables)*env.num_dimensions),
    #     keras.layers.Reshape((2+env.num_observables, env.num_dimensions))
    # ])
    # transition_net.compile(optimizer='adam', loss='mse')

    # reward_net = keras.Sequential([
    #     keras.layers.Input(shape=(3+env.num_observables, env.num_dimensions)),
    #     keras.layers.Flatten(),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(32, activation='relu'),
    #     keras.layers.Dense(1)
    # ])
    # reward_net.compile(optimizer='adam', loss='mse')

    transition_net_lstm = keras.Sequential([
        keras.layers.Input(shape=(steps_per_episode, 3 + env.num_observables,
                                  env.num_dimensions)),
        keras.layers.Reshape((steps_per_episode,
                              (3 + env.num_observables) * env.num_dimensions)),
        keras.layers.LSTM(32, return_sequences=False),
        keras.layers.Dense((2 + env.num_observables) * env.num_dimensions),
        keras.layers.Reshape((2 + env.num_observables, env.num_dimensions))
    ])
    transition_net_lstm.compile(optimizer='adam', loss='mse')

    reward_net_lstm = keras.Sequential([
        keras.layers.Input(shape=(steps_per_episode, 3 + env.num_observables,
                                  env.num_dimensions)),
        keras.layers.Reshape((steps_per_episode,
                              (3 + env.num_observables) * env.num_dimensions)),
        keras.layers.LSTM(32, return_sequences=False),
        keras.layers.Dense(1)
    ])
    reward_net_lstm.compile(optimizer='adam', loss='mse')

    transition_net_lstm.fit(
        transitions_lstm[0],
        transitions_lstm[1][:, -1],
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(validation_transitions_lstm[0],
                         validation_transitions_lstm[1][:, -1]),
        callbacks=[tensorboard_callback_transitions])
    reward_net_lstm.fit(rewards_lstm[0],
                        rewards_lstm[1][:, -1],
                        epochs=epochs,
                        batch_size=batch_size,
                        validation_data=(validation_rewards_lstm[0],
                                         validation_rewards_lstm[1][:, -1]),
                        callbacks=[tensorboard_callback_reward])
Example #18
import sys
import os.path
sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir))

import argparse
import numpy as np
#from gym import wrappers
import gym

from memory import ReplayMemory
from policy import EpsilonPolicy, RandomPolicy
from agent import DQNAgent
from processor import VoidProcessor
from model import FullyConnectedModel

parser = argparse.ArgumentParser()
args = parser.parse_args()

env = gym.make('CartPole-v1')
num_actions = env.action_space.n

model = FullyConnectedModel(num_actions=num_actions, neurons_per_layer=5, num_layers=2, learning_rate=0.002, load_weights_file=None)

memory = ReplayMemory(maxlen=10000, game_over_bias=10)
processor = VoidProcessor()

policy = RandomPolicy()
dqn = DQNAgent(env=env, memory=memory, policy=policy, model=model, discount_rate=0.99, processor=processor)
dqn.play(delay=0.2)
Example #19
 def temp_replace_policy(self):
     if self.run_type is RunType.RAND_FILL:
         self.agent.currently_used_policy = RandomPolicy()
     if self.run_type is RunType.TEST:
         self.agent.currently_used_policy = GreedyPolicy()
Example #20
def FQI(possible_actions, iterations, verbose=True, gamma=0.99, env=None):
    """Fitted Q-Iteration: build a dataset (X, Y, X_next) with a random policy, then iteratively refit an ExtraTrees Q-model."""

    model = ExtraTreesRegressor(50)
    # Y_0 = Y

    start_time = time.time()

    alive_times = []
    rewards_means = []

    policy = RandomPolicy()
    X, Y, X_next = dataset_util.get_dataset(100,
                                            200,
                                            list_of_traj=False,
                                            env=env,
                                            policy=policy,
                                            verbose=False)

    for j in range(iterations):

        Y_0 = Y

        model.fit(X, Y)

        # Update Y
        Y = []
        for i, x_next in enumerate(X_next):
            to_predict = np.array(
                list(
                    map(lambda u: np.concatenate(([u], x_next)),
                        possible_actions)))

            max_prediction = max(model.predict(to_predict))

            Y.append(Y_0[i] + gamma * max_prediction)

        j = j + 1  # 1-based fit count used by the progress report below

        policy = OptimalPolicyDiscrete(possible_actions, model)

        # Testing
        alive = []
        rews = []
        for k in range(128):
            init_state = env.reset()
            agent = Agent(env, init_state)
            agent.policy = policy

            done = False
            reward = 0
            decay = 1
            steps = 0
            while not done:
                _, r, done = agent.step()
                reward += r * decay
                decay *= 0.99
                steps += 1

            alive.append(steps)
            rews.append(reward)

        alive_times.append(np.mean(alive))
        rewards_means.append(np.mean(rews))

        # Printing for verbose mode
        if verbose:
            remaining_iterations = iterations - j
            elapsed_time = time.time() - start_time
            remaining_time = 0
            if j > 0:
                remaining_time = elapsed_time / j * remaining_iterations
            print(
                "Fit {}, elapsed time {:.0f}s, remaining time {:.0f}s, alive steps = {:.1f}, reward = {:.1f}"
                .format(j, elapsed_time, remaining_time, alive_times[-1],
                        rewards_means[-1]))

    return [model, alive_times, rewards_means]
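OptimalPolicyDiscrete is not shown in this listing. Under the assumption that it acts greedily with respect to the fitted model and uses the same [action, *state] feature layout as the target update above, a sketch could look like the following (the class name and the agent.state attribute are assumptions):

import numpy as np


class GreedyDiscretePolicySketch:
    """Hypothetical greedy policy over the fitted Q-model used in FQI above."""

    def __init__(self, possible_actions, model):
        self.possible_actions = list(possible_actions)
        self.model = model

    def get_action(self, agent):
        state = agent.state  # assumed attribute holding the current state vector
        # Score every candidate action with the same [action, *state] layout
        # used when building the regression targets.
        candidates = np.array(
            [np.concatenate(([a], state)) for a in self.possible_actions])
        q_values = self.model.predict(candidates)
        return self.possible_actions[int(np.argmax(q_values))]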
Example #21
    return pred_gp_mean, pred_gp_variance, rollout_gp, pred_gp_mean_trajs, pred_gp_variance_trajs, rollout_gp_trajs


if __name__ == '__main__':
    import matplotlib.pyplot as plt
    plt.style.use('ggplot')
    from cartpole_sim import CartpoleSim
    from policy import SwingUpAndBalancePolicy, RandomPolicy
    from visualization import Visualizer
    import cv2

    vis = Visualizer(cartpole_length=1.5,
                     x_lim=(0.0, DELTA_T * NUM_DATAPOINTS_PER_EPOCH))
    swingup_policy = SwingUpAndBalancePolicy('policy.npz')
    random_policy = RandomPolicy(seed=12831)
    sim = CartpoleSim(dt=DELTA_T)

    # Initial training data used to train GP for the first epoch
    init_state = np.array([0.01, 0.01, np.pi * 0.5, 0.1]) * rng.randn(4)
    ts, state_traj, action_traj = sim_rollout(sim, random_policy,
                                              NUM_DATAPOINTS_PER_EPOCH,
                                              DELTA_T, init_state)

    delta_state_traj = state_traj[1:] - state_traj[:-1]

    train_x, train_y = make_training_data(state_traj[:-1], action_traj,
                                          delta_state_traj)

    for epoch in range(NUM_TRAINING_EPOCHS):
        vis.clear()
Example #22
                        type=bool,  # caution: argparse's bool() treats any non-empty string as True
                        default=True,
                        help="True for density sensor, false for minimum")
    return parser.parse_args()


def joint_action(policies, joint_input):
    return [f(x).data.numpy() for f, x in zip(policies, joint_input)]


if __name__ == "__main__":
    arglist = parse_args()
    domain = Task_Rovers(arglist)
    obs = domain.reset()

    # Random policies (replaced below by the evolved network policies).
    policy = RandomPolicy(output_shape=2, low=-1, high=1)
    policies = [policy.get_next() for _ in range(arglist.num_rover)]

    networks = [Evo_MLP(12, 2) for _ in range(arglist.num_rover)]
    policies = [net.get_next() for net in networks]
    updates = [net.get_evo() for net in networks]

    obs = domain.reset()
    for _ in range(arglist.num_timestep):
        print("Step")
        action = joint_action(policies, obs)
        print(action)
        for f in updates:
            f()
        action = joint_action(policies, obs)
        print(action)
Example #23
import data_utils
from mdp import MDP
from rewards import reward_func_linear  # Call it with stats to initialize
from env import Env
from q_learning import QLearningAlgo
from policy import EpsilonGreedyPolicy, GreedyPolicy, RandomPolicy

data = data_utils.Data(n=15)
mdp = MDP(data=data)
reward_func = reward_func_linear(data.statistics, verbose=False)
env = Env(reward_func=reward_func, mode='human')
# policy = EpsilonGreedyPolicy(action_space = mdp.action_space)
policy = RandomPolicy(action_space=mdp.action_space)
test_policy = GreedyPolicy(action_space=mdp.action_space)
algo = QLearningAlgo(env=env, mdp=mdp, policy=policy, discount=0.2)

algo.set_mode('train')
algo.fit(mode='train', epochs=4, remember=True)

algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

algo.replay(batch_size=16, epochs=8)

algo.set_mode('test')
algo.test(mode='test', policy=test_policy)

# algo.test(mode = 'human', policy = test_policy)

import numpy as np
import matplotlib.pyplot as plt
Example #24
        goal = tuple(np.random.randint(0, 10, [2]))
    return goal


def plot(rewards_trails, color):
    for count in range(10):
        plt.plot(np.arange(10000), rewards_trails[count], linestyle='dotted')
    line, = plt.plot(np.arange(10000),
                     np.average(rewards_trails, axis=0),
                     linestyle='solid',
                     color=color)
    return line


manual_agent()
random_policy_rewards = non_learning_agent(RandomPolicy())
worse_policy_rewards = non_learning_agent(WorsePolicy())
better_policy_rewards = non_learning_agent(BetterThanRandomPolicy())
line1 = plot(random_policy_rewards, 'red')
line2 = plot(worse_policy_rewards, 'green')
line3 = plot(better_policy_rewards, 'blue')
plt.ylabel("Cumulative reward")
plt.xlabel("Steps")
plt.legend([line1, line2, line3], ['random', 'worse', 'better'])
plt.savefig('qt4.png')
plt.show()

random_goal = get_random_goal()
print(f"New goal is {random_goal}")
random_policy_rewards = non_learning_agent(RandomPolicy(), random_goal)
worse_policy_rewards = non_learning_agent(WorsePolicy(), random_goal)