Example #1
def run_random(stochastic=False, noisy=False, problem_id=0):
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic,
                            noisy=noisy,
                            problem_id=problem_id)
    states = []
    rewards = []
    done = False
    s = env.reset()
    states.append(s)
    while not done:
        #s, r, done, i = env.step(action=0) # deterministic agent
        s, r, done, i = env.step(action=np.random.choice(env.action_space.n))
        states.append(s)
        rewards.append(r)
    if stochastic:
        print('Stochastic=True, with Noisy=' + str(noisy) + ', rewards=' +
              str(sum(rewards)))
    else:
        print('Problem ' + str(problem_id) + ' reward sum: ' +
              str(sum(rewards)))
    # Generate pictures: see the plotting sketch below
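# A hedged sketch of the truncated picture-generation step above, assuming
# the four-component state layout [susceptibles, infectious, quarantined,
# recovereds] used in Example #13:
import numpy as np
from matplotlib import pyplot as plt

def plot_run(states):
    states = np.array(states)
    for i, name in enumerate(
            ['susceptibles', 'infectious', 'quarantined', 'recovereds']):
        plt.plot(states[:, i], label=name)
    plt.xlabel('Weeks')
    plt.ylabel('States')
    plt.legend()
    plt.show()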
Example #2
def run_qtable(stochastic=False, noisy=False, id=0, num_episodes=20):
    global rw
    global sss
    rw = []
    #Build Q-table
    table = build_q_table(ACTIONS)
    print("Q-Learning training")
    # Initialize the environment
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=id)
    rewards = []

    for episode in range(num_episodes):
        states = []
        rewards = []
        done = False
        s = env.reset()
        states.append(s)
        while not done:
            a = choose_action(s, table)
            s_, R_, done, i = env.step(action=a)
            #Training
            qlearning_train(s, a, R_, s_, done, table)
            s = s_
            states.append(s)
            rewards.append(R_)
            sss = states
    env.close()
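# build_q_table, choose_action and qlearning_train are project helpers not
# shown here. A minimal pandas-based sketch, assuming the state is binned by
# the fraction of infectious people; the real implementation may differ:
import numpy as np
import pandas as pd

ACTIONS = [0, 1, 2, 3]  # the four interventions exposed by virl
N_BINS = 10             # hypothetical number of state bins
EPSILON, ALPHA, GAMMA = 0.9, 0.1, 0.9

def build_q_table(actions, n_bins=N_BINS):
    # one row per discretized state, one column per action, all zeros
    return pd.DataFrame(np.zeros((n_bins, len(actions))), columns=actions)

def discretize(s, n_bins=N_BINS):
    # map the infectious fraction s[1] / population into a bin index
    frac = float(s[1]) / max(float(np.sum(s)), 1.0)
    return min(int(frac * n_bins), n_bins - 1)

def choose_action(s, table):
    # epsilon-greedy: exploit with probability EPSILON, otherwise explore
    row = table.iloc[discretize(s)]
    if np.random.uniform() < EPSILON and row.any():
        return int(row.idxmax())
    return int(np.random.choice(len(table.columns)))

def qlearning_train(s, a, r, s_, done, table):
    # one-step Q-learning backup: Q(s,a) += alpha * (target - Q(s,a))
    q_old = table.loc[discretize(s), a]
    q_target = r if done else r + GAMMA * table.iloc[discretize(s_)].max()
    table.loc[discretize(s), a] = q_old + ALPHA * (q_target - q_old)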
Example #3
def run_pg(stochastic=True, noisy=True, problem_id=0, episodes=20):

    nl = 'nt' if noisy else 'nf'
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        label = 'stochastic_{}'.format(nl)
    else:
        env = virl.Epidemic(stochastic=stochastic,
                            noisy=noisy,
                            problem_id=problem_id)
        label = 'problem{}_{}'.format(problem_id, nl)
    # get the agent controlled by the policy gradient
    state = env.reset()
    agent = policyGradient(actions=env.actions, stateSize=len(state), seed=1)
    rewards = []
    states = []
    for i in range(episodes):
        # reset the environment for a new episode
        state = env.reset()
        while True:
            # RL choose action based on observation
            action = agent.getAction(state)
            # RL take action and get next observation and reward
            state_, reward, done, info = env.step(action)
            if i == episodes - 1:
                rewards.append(reward)
                states.append(state)
            # store the data tuple (s,a,r) and train
            agent.storeTransition(state, action, reward)
            # update the current state using observation
            state = state_
            # episode is over; start the next one
            if done:
                break
        agent.train()

    print(sum(rewards))
    # return rewards,states
    draw_pic(agent.lossList, 'iter', 'loss',
             'PG_loss{}_{}.png'.format(episodes, label))
    draw_pic(rewards, 'iter', 'reward',
             'PG_reward_{}_{}.png'.format(episodes, label))
    draw_state(states, 'PG_state_{}_{}.png'.format(episodes, label))
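# policyGradient, draw_pic and draw_state are project modules not shown here.
# A minimal REINFORCE sketch with the same interface as the agent above
# (getAction / storeTransition / train / lossList); the real agent is
# presumably a neural network, so treat this as illustrative only. Note the
# raw virl states are large population counts and would need scaling in practice.
import numpy as np

class PolicyGradientSketch:
    def __init__(self, actions, stateSize, seed=1, lr=0.01, gamma=0.95):
        np.random.seed(seed)
        self.actions = list(actions)
        self.W = np.zeros((stateSize, len(self.actions)))  # linear policy
        self.lr, self.gamma = lr, gamma
        self.ep_s, self.ep_a, self.ep_r = [], [], []
        self.lossList = []

    def _proba(self, s):
        z = np.asarray(s, dtype=float) @ self.W
        e = np.exp(z - z.max())  # numerically stable softmax
        return e / e.sum()

    def getAction(self, s):
        return int(np.random.choice(len(self.actions), p=self._proba(s)))

    def storeTransition(self, s, a, r):
        self.ep_s.append(np.asarray(s, dtype=float))
        self.ep_a.append(a)
        self.ep_r.append(r)

    def train(self):
        # discounted, normalized returns for the finished episode
        G, running = np.zeros(len(self.ep_r)), 0.0
        for t in reversed(range(len(self.ep_r))):
            running = self.ep_r[t] + self.gamma * running
            G[t] = running
        G = (G - G.mean()) / (G.std() + 1e-8)
        loss = 0.0
        for s, a, g in zip(self.ep_s, self.ep_a, G):
            p = self._proba(s)
            grad = -p
            grad[a] += 1.0  # gradient of log-softmax at the chosen action
            self.W += self.lr * g * np.outer(s, grad)
            loss += -np.log(p[a] + 1e-8) * g
        self.lossList.append(loss / max(len(self.ep_r), 1))
        self.ep_s, self.ep_a, self.ep_r = [], [], []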
Example #4
def get_all_problems_fig():
    pic_dir = './results/Deterministic'
    for noisy in [True, False]:
        for act in range(4):
            filename = os.path.join(
                pic_dir,
                'all_problems_noisy={} action={}.png'.format(noisy, act))
            if os.path.exists(filename):
                continue
            fig, ax = plt.subplots(figsize=(8, 6))
            for i in range(10):
                env = virl.Epidemic(problem_id=i, noisy=noisy)
                states = []
                rewards = []
                done = False
                s = env.reset()
                states.append(s)
                while not done:
                    s, r, done, info = env.step(
                        action=act)  # deterministic agent
                    states.append(s)
                    rewards.append(r)
                ax.plot(np.array(states)[:, 1], label=f'problem_id={i}')
            ax.set_xlabel('weeks since start of epidemic')
            ax.set_ylabel('Number of Infectious persons')
            ax.set_title(
                'Simulation of problem_ids with action {}'.format(act))
            ax.legend()
            plt.savefig(fname=filename, dpi=300)
            plt.close(fig)
Example #5
def run_qlnn(stochastic=False, noisy=False, id=0):
    # Setting environment parameters
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        print('env stochastic=' + str(stochastic) + '/noisy=' + str(noisy))
    else:
        env = virl.Epidemic(stochastic=stochastic, noisy=noisy, problem_id=id)
        print('env stochastic=' + str(stochastic) + '/noisy=' + str(noisy) +
              '/problem_id=' + str(id))

    d_states = env.observation_space.shape[0]
    n_actions = env.action_space.n
    print('initialize function approximator with learning rate: ' +
          str(alpha))
    # Init the two networks
    nn_func_approximator = NNFunctionApproximatorJointKeras(
        alpha, d_states, n_actions, nn_config)
    nn_func_approximator_target = NNFunctionApproximatorJointKeras(
        alpha, d_states, n_actions, nn_config)
    # Training
    print('Training>>>')
    stats = q_learning_nn(env,
                          nn_func_approximator,
                          nn_func_approximator_target,
                          20,
                          max_steps_per_episode=52,
                          epsilon_init=0.1,
                          epsilon_decay=0.995,
                          epsilon_min=0.001,
                          fn_model_in=None,
                          fn_model_out="temp.h5")
    print('Training done!')
    print('Testing>>>')
    nn_func_approximator.alpha = 0.0
    epsilon_fixed = 0.1
    stats_show = q_learning_nn(env,
                               nn_func_approximator,
                               nn_func_approximator_target,
                               1,
                               max_steps_per_episode=52,
                               epsilon_init=epsilon_fixed,
                               epsilon_decay=1.0,
                               epsilon_min=epsilon_fixed,
                               show=True,
                               fn_model_in="temp.h5")
    print('Test done!')
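# NNFunctionApproximatorJointKeras and q_learning_nn are project utilities
# not shown here. The two copies implement the standard DQN target-network
# trick: TD targets are computed from a frozen copy whose weights are
# periodically synced from the online network. A hedged sketch of that sync,
# assuming each approximator exposes its Keras model as .model:
def sync_target(online, target):
    # overwrite the target network's weights with the online network's
    target.model.set_weights(online.model.get_weights())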
Example #6
def get_env():
    # yield every problem_id with noisy switched both off and on
    for i in range(10):
        for noisy in (False, True):
            yield i, noisy, virl.Epidemic(problem_id=i, noisy=noisy)
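# A usage sketch for the generator above: run the no-intervention action
# (assumed to be action 0) on every combination and report reward sums:
for problem_id, noisy, env in get_env():
    done, total = False, 0.0
    env.reset()
    while not done:
        _, r, done, _ = env.step(action=0)
        total += r
    print(problem_id, noisy, total)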
Example #7
def evaluate(policy, problem_id=0, full_eval=False, verbose=True, noisy=False):
    """
    Evaluate a policy

    :param policy a callable that, given in input a state, returns an action
    :param full_eval whether to fully evaluate the policy on all the problems or the first problem only
    :param verbose whether to get verbose output
    :param noisy whether to simulate a noisy environment
    """
    #trained_policy = create_policy(approximator_dl, 0, 4)

    if not full_eval:
        limit = 1
        envs = [virl.Epidemic(problem_id=problem_id, noisy=noisy)]
    else:
        limit = 10
        envs = [virl.Epidemic(problem_id=i, noisy=noisy) for i in range(limit)]

    fig, axes = plt.subplots(limit, 2, figsize=(20, 8 * limit))

    total_rewards = []

    for i, env in enumerate(envs, start=problem_id if limit == 1 else 0):
        states, rewards, action_taken = execute_policy(policy, env)
        if verbose:
            print(i, action_taken)
        # with a single problem, axes is a flat pair of axes rather than a grid
        if limit == 1:
            axes_wrapper = [axes[0], axes[1]]
        else:
            axes_wrapper = axes[i]
        plot(states, rewards, action_taken, axes=axes_wrapper, problem_id=i)
        total_rewards.append(sum(rewards))

    if limit > 1:
        _, ax = plt.subplots(1, 1, figsize=(10, 4))
        ax.bar(np.arange(limit), total_rewards)
        ax.set_xticks(np.arange(limit))
        return total_rewards
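# execute_policy is defined elsewhere in this project. A minimal sketch
# consistent with how it is called here, assuming the policy returns a
# probability distribution over actions (as policy_greedy in Example #9 does):
def execute_policy(policy, env):
    states, rewards, actions_taken = [], [], []
    done = False
    s = env.reset()
    states.append(s)
    while not done:
        a = int(np.argmax(policy(s)))  # act greedily w.r.t. the distribution
        s, r, done, _ = env.step(action=a)
        states.append(s)
        rewards.append(r)
        actions_taken.append(a)
    return states, rewards, actions_taken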
Example #8
def train(lr=0.01, n_episodes=50):
    print('qlearning nn training...')
    env = virl.Epidemic()
    n_actions = env.action_space.n
    n_states = env.observation_space.shape[0]

    policy_estimator = NeuralNetwork(env, n_states, n_actions, lr=lr)
    stats = implement(env, policy_estimator, n_episodes, discount_factor=0.95)
    results_dir = './results/qlearning_nn'
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)
    pkl_file = os.path.join(
        results_dir,
        'qlearning_nn_lr={}_episodes={}.pkl'.format(lr, n_episodes))
    with open(pkl_file, 'wb') as f:
        pickle.dump(policy_estimator, f)
    return policy_estimator
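# A usage sketch: the pickled estimator can be reloaded later for evaluation;
# the path follows the naming scheme used by train() above:
with open('./results/qlearning_nn/qlearning_nn_lr=0.01_episodes=50.pkl',
          'rb') as f:
    policy_estimator = pickle.load(f)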
Example #9
def policy_greedy(state) -> np.ndarray:
    def eval_reward(state, action):
        policy_severity_factor = 1e11
        a = state[1] + state[2]
        b = (1 - action)

        expected_a = a * (1 + action - 0.1)
        val = (-expected_a - expected_a**2 - policy_severity_factor * b -
               policy_severity_factor * b**2) / policy_severity_factor

        return val

    env = virl.Epidemic()

    greedy_rewards = np.array([eval_reward(state, a) for a in env.actions])
    action_id = np.argmax(greedy_rewards)

    action_proba = np.zeros(len(env.actions))
    action_proba[action_id] = 1.0

    return action_proba
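# A usage sketch: policy_greedy returns a one-hot action distribution, so it
# plugs directly into the evaluators from Example #7 and Example #10:
rewards_per_problem = evaluate(policy_greedy, full_eval=True, verbose=False)
print(rewards_per_problem)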
Example #10
def evaluate_stochastic(policy, num_tries=10, noisy=True):
    """
    Evaluate a policy in a stochastic environment.

    horribly copied from generate_readme_plots.ipynb

    :param policy: a callable that, given a state, returns a probability distribution over actions
    :param num_tries: the number of tries to perform
    :param noisy: whether to simulate a noisy environment

    """

    fig, ax = plt.subplots(figsize=(8, 6))
    for i in range(num_tries):
        env = virl.Epidemic(stochastic=True, noisy=noisy)
        states, rewards, actions_taken = execute_policy(policy, env)
        ax.plot(np.array(states)[:, 1], label=f'draw {i}')
    ax.set_xlabel('weeks since start of epidemic')
    ax.set_ylabel('Number of Infectious persons')
    ax.set_title(
        f'Simulation of {num_tries} stochastic episodes under the policy')
    ax.legend()
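# A usage sketch pairing this evaluator with policy_greedy from Example #9:
evaluate_stochastic(policy_greedy, num_tries=10, noisy=True)
plt.show()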
Example #11
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

import os
os.chdir('..')
from matplotlib import pyplot as plt
import numpy as np
import virl

# set up the environment

env = virl.Epidemic(stochastic=False,
                    noisy=False,
                    problem_id=(np.random.choice(10)))

states = []
rewards = []
done = False

s = env.reset()
states.append(s)

# run a random agent for one episode

while not done:
    s, r, done, i = env.step(action=np.random.choice(env.action_space.n))
    states.append(s)
    rewards.append(r)

print(rewards)
Example #12
import virl
import numpy as np
from agents import DeterministicAgent, RandomAgent

from matplotlib import pyplot as plt

env = virl.Epidemic(stochastic=False, noisy=False)
agent = DeterministicAgent(env)
states = []
rewards = []
done = False

s = env.reset()
states.append(s)
while not done:
    s, r, done, i = env.step(action=agent.get_action())
    states.append(s)
    rewards.append(r)

states = np.array(states)
rewards = np.array(rewards)

plt.plot(rewards)

print(states[:, 0][0:4])
print(states[:, 1][0:4])
print(states[:, 2][0:4])
print(states[:, 3][0:4])

print("rewards " + str(rewards[:4]))
Example #13
def QLtest(stochastic=False, noisy=False, problem_id=0, num_episodes=20):

    k = problem_id
    print(stochastic, noisy)
    time_start = time.time()
    run_qtable(stochastic=stochastic,
               noisy=noisy,
               id=problem_id,
               num_episodes=num_episodes)
    time_end = time.time()
    print('QL totally cost', time_end - time_start)
    #Test Q-learning
    #Set the parameters
    if stochastic:
        env = virl.Epidemic(stochastic=True, noisy=noisy)
    else:
        env = virl.Epidemic(stochastic=stochastic,
                            noisy=noisy,
                            problem_id=problem_id)
    states = []
    rewards = [0]
    done = False
    s = env.reset()
    states.append(s)
    ac = []
    # Load the trained Q-table
    if stochastic:
        qtable = pd.read_csv(r'Qtable_stochastic.csv')
    else:
        qtable = pd.read_csv(r'Qtable' + str(k) + '.csv')
    #Testing
    while not done:
        a = choose_action(s, qtable)
        ac.append(a)
        s_, R_, done, i = env.step(action=a)
        s = s_
        states.append(s)
        rewards.append(R_)
    if stochastic:
        #table.to_csv(r'Qtable_stochastic.csv', index=0)
        print('Stochastic=True, with Noisy=' + str(noisy) + ', rewards=' +
              str(sum(rewards)))
    else:
        #table.to_csv(r'Qtable'+str(id)+'.csv',index=0)
        print('Problem ' + str(problem_id) + ' reward sum: ' +
              str(sum(rewards)))
    #Generate pictures
    plt.figure(1)
    states = np.array(states)
    labels = ['susceptibles', 'infectious', 'quarantined', 'recovereds']
    x = np.arange(0, len(states[:, 1]))
    for i in range(0, 4):
        plt.plot(x, states[:, i], label=labels[i])
    path = 'QL(Noisy)_problem_' + str(k) + '.svg'
    plt.xlabel('Weeks')
    plt.ylabel('States')
    print(ac)
    plt.legend()
    #plt.savefig(path)
    plt.figure(2)
    plt.plot(x, rewards)
    plt.xlabel('Weeks (reward sum: ' + str(float(sum(rewards))) + ')')
    plt.ylabel('Reward')
    #plt.savefig(r'QL_reward_'+str(k)+'.svg')
    plt.show()
Example #14
    INITIAL_EPSILON = 1  # starting value of epsilon
    FINAL_EPSILON = 0.1  # final value of epsilon
    REPLAY_SIZE = 10000  # experience replay buffer size
    BATCH_SIZE = 32  # size of minibatch
    s = np.array([0., 0, 0, 0])  # epidemic state
    c = 1.  # infection rate damping

    for i in range(1):

        # number of training episodes, optionally taken from the command line
        try:
            episodes = int(sys.argv[1])
        except Exception:
            episodes = 500
        # random.seed(1)
        env = virl.Epidemic(noisy=False, problem_id=i)
        # create the environment
        action_dim = env.action_space.n
        # state_dim = env.observation_space.n
        agent = DQN()

        # the agent is constructed independently of the environment
        all_reward = []
        # record of per-episode rewards
        for episode in range(episodes + 1):

            # run one episode
            state = env.reset()

            # linearly decay epsilon from its initial value toward FINAL_EPSILON
            agent.epsilon -= 0.7 / episodes
    def update(self, s, a, td_target):
        """
        Updates the approximator's parameters (i.e. the weights) for a given state and action towards
        the target y (which is the TD target).
        """
        features = self.featurize_state(s)
        self.models[a].partial_fit(
            [features],
            [td_target])  # recall that we keep a separate model for each action


from utils import (q_learning, exec_policy, get_fig, plt)

if __name__ == '__main__':
    env = virl.Epidemic(stochastic=False, noisy=False)

    rbf_file = './rbf.pkl'
    if os.path.exists(rbf_file):
        with open(rbf_file, 'rb') as f:
            rbf_func = pickle.load(f)
        print('Loaded RBF approximator from file.')
    else:
        rbf_func = RbfFunctionApproximator(env)
        # training
        states = q_learning(env, rbf_func, 1500, epsilon=0.05)
        # save the approximate function
        with open(rbf_file, 'wb') as f:
            pickle.dump(rbf_func, f)
    # make dir
    if not os.path.exists('./results/RBF'):