def test_policy():
    # Equal probability for each of the actions
    theta = np.array([[1, 1, 1], [1, 1, 1], [1, 1, 1]], dtype=float)

    # Think rock. Probability of selecting rock
    p = policy(State.ROCK_1, Action.ROCK, theta)
    assert p == 1/3

    # When predicting rock, always play paper
    theta = np.array([[float("-inf"), 1, float("-inf")], [1, 1, 1], [1, 1, 1]], dtype=float)
    p = policy(State.ROCK_1, Action.ROCK, theta)
    assert p == 0
    p = policy(State.ROCK_1, Action.PAPER, theta)
    assert p == 1
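
# policy() itself is not shown in this example. A minimal sketch consistent with
# the assertions above, assuming a per-state softmax over theta and that State
# and Action are IntEnums whose values index theta's rows and columns:
import numpy as np

def policy(state, action, theta):
    prefs = theta[state.value]
    exp_prefs = np.exp(prefs)  # exp(-inf) evaluates to 0.0, masking that action
    return exp_prefs[action.value] / exp_prefs.sum()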
Example #2
def inst2string(time, aspath, node):
    # Build one output row: the timestamp followed by one column per entry in
    # the module-level `nodes` list. Only `node` gets a preference value; the
    # other columns stay as empty quoted strings.
    retval = [time]
    for n in nodes:
        if n != node:
            retval.append('""')
        else:
            r = Route('d', {'AS_PATH': aspath})
            pref = policy(node, r)
            # binary representation of the preference, without the '0b' prefix
            retval.append(bin(pref)[2:])
    return retval
Example #3
 def decisionProcess(self, now, update=None):
     # HARD CODING!!! I know there is only one destination in the RT
     prefix = update[1].prefix if update else list(
         self.RT.adjRIBin.keys())[0]
     if update:
         if prefix not in self.RT.adjRIBin:
             len_adj_before = 0
         else:
             len_adj_before = len(list(self.RT.adjRIBin[prefix].items()))
         self.RT.update_adjRIBin(update)
         len_adj_after = len(list(self.RT.adjRIBin[prefix].items()))
         fromWho = update[0]
         # Phase 1,2: compute preferences, then select&install the best
         best_rt, learned_by, max_pref = None, None, float('-inf')
         for sender, route in self.RT.adjRIBin[prefix].items():
             rt_preference = policy(self.ID, route)
             if rt_preference > max_pref:
                 best_rt, learned_by, max_pref = route, sender, rt_preference
         if (learned_by, best_rt) == update:
             PROCESSING_RESULT = "NEW_BEST_PATH"
         elif len_adj_after > len_adj_before:
             PROCESSING_RESULT = "NEW_PATH"
         elif len_adj_after == len_adj_before:
             PROCESSING_RESULT = "REMOVED_REPLACED_PATH"
         else:
             PROCESSING_RESULT = "NONE"
         old_best = self.RT[best_rt.prefix][
             'AS_PATH'] if best_rt.prefix in self.RT else "NONE"
         self.RT.install_route(best_rt, learned_by, max_pref, now)
         new_best_path = self.RT[best_rt.prefix]['AS_PATH']
         tim = self.start_time + datetime.timedelta(0, now)
         time = tim.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
         if len(update[1].as_path()) > 0 and update[1].as_path() != 'P':
             self.log2(time + " <FATAL> {type: UPDATE_RX, dest: " +
                       str(best_rt.prefix).split('/')[0] + ", from: " +
                       str(fromWho) + ", nh: " + str(fromWho) +
                       ", as_path: " +
                       str(update[1].as_path()).replace(',', '|') +
                       ", previus_best_path: " +
                       str(old_best).replace(',', '|') +
                       ", actual_best_path: " +
                       str(new_best_path).replace(',', '|') +
                       ", processing: " + PROCESSING_RESULT + "}\n")
     else:
         # self.log2("decision process without update\n")
         pass
     self.disseminate(prefix, now)
Example #4
    def __init__(self, batch_update_size=C.UPDATE_SIZE):
        self.HIDDEN_SIZE = C.HIDDEN_SIZE
        self.DEVICE = 'cpu'

        self.model = policy.policy(observation_space_size=C.observation_space_size,
                           action_space_size=C.action_space_size,
                           hidden_size=self.HIDDEN_SIZE).to(self.DEVICE)

        self.lock = threading.Lock()
        self.future_model = torch.futures.Future()
        self.batch_update_size = batch_update_size
        self.curr_update_size = 0
        self.current_rewards = []
        self.optimizer = optim.Adam(params=self.model.parameters(), lr=C.ALPHA)
        # Pre-allocate zero gradient buffers so reported gradients can be
        # accumulated into p.grad before each batched optimizer step.
        for p in self.model.parameters():
            p.grad = torch.zeros_like(p)
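
    # The rest of this parameter-server-style class is not shown. As a sketch
    # (the method name, the gradient format, and the batching flow below are
    # assumptions): workers could report gradients like this, with one optimizer
    # step taken once batch_update_size reports have arrived.
    def update_and_fetch_model(self, grads):
        with self.lock:
            # Accumulate the reported gradients into the pre-allocated buffers.
            for p, g in zip(self.model.parameters(), grads):
                p.grad += g
            self.curr_update_size += 1
            fut = self.future_model
            if self.curr_update_size >= self.batch_update_size:
                self.optimizer.step()
                self.optimizer.zero_grad(set_to_none=False)
                self.curr_update_size = 0
                # Resolve the shared future so waiting workers receive the
                # refreshed model, then create a fresh future for the next batch.
                fut.set_result(self.model)
                self.future_model = torch.futures.Future()
        return fut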
Example #5
 def newtickets(self, principle, subject, args):
     import policy, ticket
     ticketsdata = []
     try:
         self.agent.rlock.acquire()
         allips = self.agent.gmetadthr.getips()
         slice = args[0]["slice"].strip()
         ntickets = args[0]["ntickets"]
         leaselen = args[0]["leaselen"]
         ips = args[0]["ips"]
         pol = policy.policy(allips)
         self.newtickets_checkargs(slice, ntickets, leaselen, self.agent, pol, ips)
         for ip in pol.getips(ntickets, ips):
             t = self.tfactory.createticket(principle, ip, slice, leaselen)
             self.agent.dbthr.addticket(t.data, subject)
             ticketsdata.append(t.data)
     finally:
         self.agent.rlock.release()
     return ticketsdata
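Example #6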
def play_tictactoe(P, theta, d, n):
    total_reward = 0
    for i in range(n):
        # Choose the starting state
        s = sample_from_dist(d)
        state = State(s)

        # Get the action
        action_probs = np.zeros((3, ))
        for action in Action:
            action_probs[action.value] = policy(state, action, theta)
        a = sample_from_dist(action_probs)
        action = Action(a)

        # Get the transition state
        s2 = sample_from_dist(P[s])
        state2 = State(s2 + 3)

        # Get the reward
        reward = get_reward(state2, action)
        total_reward += reward

    return total_reward
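
# sample_from_dist() is defined elsewhere. A minimal sketch consistent with its
# use above (drawing an index from a probability vector) could be:
import numpy as np

def sample_from_dist(dist):
    dist = np.asarray(dist, dtype=float)
    dist = dist / dist.sum()  # renormalize to guard against rounding drift
    return int(np.random.choice(len(dist), p=dist))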
Example #7
from uav_track_env import Env
from policy import policy
import time

if __name__ == '__main__':
    env = Env()
    success_cnt = 0
    ep = 10
    for i in range(ep):
        print("ep:" + str(i))
        s = env.reset()
        step_cnt = 0
        while True:
            if env.terminal_state():
                print('success')
                success_cnt = success_cnt + 1
                break
            print('t0:' + str(time.time()))
            action = policy(s)
            print('t1:' + str(time.time()))
            s = env.step_for_queue(action)
            print('t2:' + str(time.time()))
            step_cnt = step_cnt + 1
            print('t3:' + str(time.time()))
            if step_cnt > 200:
                break
    print('finish')
Example #8
# --------------------------------------------------------------------------------
# Training loop
# --------------------------------------------------------------------------------

# We store the trained weights in memory at the end of each episode.
lw = []
lphi0 = []
for _ in tqdm(range(N_EPISODES)):
    obs = env.reset().reshape(1, -1)
    lphi = []
    lrew = []
    G = 0

    # During the episode we store the states we visit and the rewards we get.
    for i in range(LEN_EPISODE):
        action = policy.policy(obs, torque_value=args.torque_value)
        phi = tiles.encode(obs, tiles_intervals)
        lphi.append(phi)
        next_obs, rew, done, _ = env.renv.step(action)
        lrew.append(rew)
        next_obs = next_obs.reshape(1, -1)
        obs = next_obs

    # Now we update the weights for each transition
    for t in reversed(range(LEN_EPISODE)):
        G = GAMMA * G + lrew[t]
        w = algorithms.update_mc(w, lphi[t], G, v, g, alpha)

    # Logging
    lw.append(w)
    lphi0.append(v(phi_0, w))
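
# v, g, and algorithms.update_mc are not shown. For a linear value function over
# the tile-coded features (treated here as a dense numpy feature vector, which is
# an assumption), minimal sketches matching the call signature above could be:
import numpy as np

def v(phi, w):
    # Linear value estimate: dot product of the weights and the feature vector.
    return np.dot(w, phi)

def g(phi, w):
    # Gradient of the linear estimate with respect to w is the feature vector.
    return phi

def update_mc(w, phi, G, v, g, alpha):
    # One gradient Monte Carlo step: move w toward the observed return G.
    return w + alpha * (G - v(phi, w)) * g(phi, w)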
Example #9
import slider
import policy
import time
import torch
import memory
import random
import math
import torch.nn.functional as F
import nstep

agent = policy.policy()

lagged_agent = policy.policy()
lagged_agent.copy_weights(agent)

replay_memory_size = 100000
replay_memory = memory.ReplayMemory(replay_memory_size)

# export OMP_NUM_THREADS=1
def live(iterations, batch_size, lagg, eps, improve_flag, num_steps):
    n_step = nstep.Nstep(num_steps)
    g = slider.Game()
    state = g.get_state()
    total_reward = 0
    start = time.time()
    for i in range(iterations):

        # eps-greedy
        if random.uniform(0,1) < eps:
            action = random.randint(0,3)
        else:
Example #10
import os
import sys
import platform
import pathlib
import pandas as pd
import time
from random import randrange
current_file_path = pathlib.Path(__file__).parent.absolute()
sys.path.insert(1, os.path.join(current_file_path, 'build'))
from myBinds import Domain, Cell

import numpy as np
import torch
from pympler import asizeof

sys.path.insert(1, os.path.join(current_file_path, '..', 'Pyy'))
from policy import policy
policy_model = policy()
policy_model.load_state_dict(
    torch.load(os.path.join(current_file_path, '..', 'Pyy', 'policy.pt')))
policy_model.eval()


class pyCell(Cell):
    NN = policy_model

    def __init__(self, env, class_name):  #can go
        super().__init__(env, class_name)
        self.age = randrange(0, 100)
        self.setup()

    def setup(self):
        NN_value = pyCell.NN.forward(self.age)
Example #11
from graphics import *
import numpy as np
import policy
import torch
import time
import copy
from multiprocessing import Pool, cpu_count
from slider import *
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

ncpus = cpu_count()
width = 1500
height = 1000
agent = policy.policy(Game.state_space_size, Game.action_space_size)
optimizer = torch.optim.Adam(agent.parameters(), lr=3e-4, eps=1e-5)

device = "cuda:0" if torch.cuda.is_available() else "cpu"

print("DEVICE:", device)

agent = agent.to(device)


def getEpisode(n, agent):
    game = Game(width, height)

    states = np.zeros((n, game.state_space_size))
    actionprobs = []
    actions = np.zeros((n, game.action_space_size))
    values = []
    rewards = np.zeros(n)