def test_policy():
    # Equal probability for each of the actions
    theta = np.array([[1, 1, 1],
                      [1, 1, 1],
                      [1, 1, 1]], dtype=float)
    # Think rock. Probability of selecting rock
    p = policy(State.ROCK_1, Action.ROCK, theta)
    assert np.isclose(p, 1 / 3)  # float-safe comparison instead of ==
    # When predicting rock, always play paper
    theta = np.array([[float("-inf"), 1, float("-inf")],
                      [1, 1, 1],
                      [1, 1, 1]], dtype=float)
    p = policy(State.ROCK_1, Action.ROCK, theta)
    assert p == 0
    p = policy(State.ROCK_1, Action.PAPER, theta)
    assert p == 1
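# A minimal sketch of the `policy`, `State`, and `Action` names the test
# above assumes: a per-state softmax over the rows of theta. The enum
# encodings (0-2 for predicted moves, 3-5 for played moves, matching the
# State(s2 + 3) lookup in play_tictactoe below) are assumptions inferred
# from the call sites, not taken from the original source.
import numpy as np
from enum import Enum

class State(Enum):
    ROCK_1 = 0      # predicted opponent move ("think" states)
    PAPER_1 = 1
    SCISSORS_1 = 2
    ROCK_2 = 3      # move the opponent actually played
    PAPER_2 = 4
    SCISSORS_2 = 5

class Action(Enum):
    ROCK = 0
    PAPER = 1
    SCISSORS = 2

def policy(state, action, theta):
    # Softmax over the theta row for the current state; a float("-inf")
    # entry yields probability exactly 0 because np.exp(-inf) == 0.0.
    prefs = np.exp(theta[state.value])
    return prefs[action.value] / prefs.sum()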
def inst2string(time, aspath, node):
    # Build one CSV-style row: the timestamp, then one column per node that
    # is empty ("") for every node except `node`, whose column carries the
    # policy preference for `aspath` rendered in binary.
    retval = [time]
    for n in nodes:  # `nodes` is assumed to be defined at module level
        if n != node:
            retval.append('""')
        else:
            r = Route('d', {'AS_PATH': aspath})
            pref = policy(node, r)
            retval.append(bin(pref)[2:])  # strip the leading '0b'
    return retval
def decisionProcess(self, now, update=None):
    # HARD CODING!!! assumes there is only one destination in the RT
    prefix = update[1].prefix if update else list(self.RT.adjRIBin.keys())[0]
    if update:
        if prefix not in self.RT.adjRIBin:
            len_adj_before = 0
        else:
            len_adj_before = len(list(self.RT.adjRIBin[prefix].items()))
        self.RT.update_adjRIBin(update)
        len_adj_after = len(list(self.RT.adjRIBin[prefix].items()))
        fromWho = update[0]
        # Phases 1-2: compute preferences, then select & install the best route
        best_rt, learned_by, max_pref = None, None, float('-inf')
        for sender, route in self.RT.adjRIBin[prefix].items():
            rt_preference = policy(self.ID, route)
            if rt_preference > max_pref:
                best_rt, learned_by, max_pref = route, sender, rt_preference
        if (learned_by, best_rt) == update:
            PROCESSING_RESULT = "NEW_BEST_PATH"
        elif len_adj_after > len_adj_before:
            PROCESSING_RESULT = "NEW_PATH"
        elif len_adj_after == len_adj_before:
            PROCESSING_RESULT = "REMOVED_REPLACED_PATH"
        else:
            PROCESSING_RESULT = "NONE"
        old_best = (self.RT[best_rt.prefix]['AS_PATH']
                    if best_rt.prefix in self.RT else "NONE")
        self.RT.install_route(best_rt, learned_by, max_pref, now)
        new_best_path = self.RT[best_rt.prefix]['AS_PATH']
        tim = self.start_time + datetime.timedelta(0, now)
        time = tim.strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        if len(update[1].as_path()) > 0 and update[1].as_path() != 'P':
            self.log2(time + " <FATAL> {type: UPDATE_RX, dest: "
                      + str(best_rt.prefix).split('/')[0]
                      + ", from: " + str(fromWho)
                      + ", nh: " + str(fromWho)
                      + ", as_path: " + str(update[1].as_path()).replace(',', '|')
                      + ", previous_best_path: " + str(old_best).replace(',', '|')
                      + ", actual_best_path: " + str(new_best_path).replace(',', '|')
                      + ", processing: " + PROCESSING_RESULT + "}\n")
    else:
        # self.log2("decision process without update\n")
        pass
    self.disseminate(prefix, now)
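# A minimal sketch of the route-preference function that inst2string and
# decisionProcess above call as policy(node, route): it returns a
# non-negative integer where higher means more preferred (bin(pref)[2:] in
# inst2string expects a non-negative int). Scoring routes purely by AS_PATH
# length is an assumption for illustration; a real BGP-style policy would
# also weigh local preference, peer relationships, and tie-breakers.
MAX_AS_PATH_LEN = 64  # hypothetical cap, only to keep the score non-negative

def policy(node, route):
    # Assumed Route interface: route.as_path() returns the list of ASes.
    return MAX_AS_PATH_LEN - len(route.as_path())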
def __init__(self, batch_update_size=C.UPDATE_SIZE):
    self.HIDDEN_SIZE = C.HIDDEN_SIZE
    self.DEVICE = 'cpu'
    self.model = policy.policy(observation_space_size=C.observation_space_size,
                               action_space_size=C.action_space_size,
                               hidden_size=self.HIDDEN_SIZE).to(self.DEVICE)
    self.lock = threading.Lock()
    self.future_model = torch.futures.Future()
    self.batch_update_size = batch_update_size
    self.curr_update_size = 0
    self.current_rewards = []
    self.optimizer = optim.Adam(params=self.model.parameters(), lr=C.ALPHA)
    # Pre-allocate zeroed gradients so workers can accumulate into them
    # without hitting a None .grad on the first update.
    for p in self.model.parameters():
        p.grad = torch.zeros_like(p)
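# A minimal sketch of the policy.policy network the constructor above
# instantiates. Only the constructor signature is taken from the call site;
# the two-layer MLP architecture is an assumption.
import torch
import torch.nn as nn

class policy(nn.Module):
    def __init__(self, observation_space_size, action_space_size, hidden_size):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(observation_space_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, action_space_size),
        )

    def forward(self, x):
        return self.net(x)

# Pre-allocating p.grad as above fits the torch.distributed.rpc
# parameter-server pattern (suggested by torch.futures.Future): remote
# workers can accumulate gradients into the shared model safely.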
def newtickets(self, principle, subject, args):
    import policy, ticket
    ticketsdata = []
    try:
        self.agent.rlock.acquire()
        allips = self.agent.gmetadthr.getips()
        # Note: `slice` shadows the built-in of the same name.
        slice = args[0]["slice"].strip()
        ntickets = args[0]["ntickets"]
        leaselen = args[0]["leaselen"]
        ips = args[0]["ips"]
        pol = policy.policy(allips)
        self.newtickets_checkargs(slice, ntickets, leaselen, self.agent, pol, ips)
        for ip in pol.getips(ntickets, ips):
            t = self.tfactory.createticket(principle, ip, slice, leaselen)
            self.agent.dbthr.addticket(t.data, subject)
            ticketsdata.append(t.data)
    finally:
        self.agent.rlock.release()
    return ticketsdata
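# A minimal sketch of the IP-allocation policy object newtickets relies on,
# assuming policy.policy(allips) wraps the pool of assignable addresses and
# getips(n, requested) hands out up to n of them, honoring explicitly
# requested IPs first. This interface is inferred from the call sites only.
class policy:
    def __init__(self, allips):
        self.pool = list(allips)

    def getips(self, n, requested=None):
        # Requested addresses that are actually in the pool come first.
        chosen = [ip for ip in (requested or []) if ip in self.pool][:n]
        for ip in self.pool:
            if len(chosen) >= n:
                break
            if ip not in chosen:
                chosen.append(ip)
        return chosen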
def play_tictactoe(P, theta, d, n):
    total_reward = 0
    for i in range(n):
        # Choose the starting state
        s = sample_from_dist(d)
        state = State(s)
        # Get the action
        action_probs = np.zeros((3,))
        for action in Action:
            action_probs[action.value] = policy(state, action, theta)
        a = sample_from_dist(action_probs)
        action = Action(a)
        # Get the transition state
        s2 = sample_from_dist(P[s])
        state2 = State(s2 + 3)
        # Get the reward
        reward = get_reward(state2, action)
        total_reward += reward
    return total_reward
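# A minimal sketch of the sample_from_dist helper play_tictactoe relies on:
# draw an index according to a (possibly unnormalized) probability vector.
# The normalization step is an assumption added for robustness.
import numpy as np

def sample_from_dist(dist):
    dist = np.asarray(dist, dtype=float)
    return np.random.choice(len(dist), p=dist / dist.sum())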
from uav_track_env import Env
from policy import policy
import time

if __name__ == '__main__':
    env = Env()
    success_cnt = 0
    ep = 10
    for i in range(ep):
        print("ep:" + str(i))
        s = env.reset()
        step_cnt = 0
        while True:
            if env.terminal_state():
                print('success')
                success_cnt += 1
                break
            print('t0:' + str(time.time()))
            action = policy(s)
            print('t1:' + str(time.time()))
            s = env.step_for_queue(action)
            print('t2:' + str(time.time()))
            step_cnt += 1
            print('t3:' + str(time.time()))
            if step_cnt > 200:
                break
    print('finish')
# --------------------------------------------------------------------------------
# Training loop
# --------------------------------------------------------------------------------
# At the end of each episode we store the trained weights in memory.
lw = []
lphi0 = []
for _ in tqdm(range(N_EPISODES)):
    obs = env.reset().reshape(1, -1)
    lphi = []
    lrew = []
    G = 0
    # During the episode we store the states we visit and the rewards we get.
    for i in range(LEN_EPISODE):
        action = policy.policy(obs, torque_value=args.torque_value)
        phi = tiles.encode(obs, tiles_intervals)
        lphi.append(phi)
        next_obs, rew, done, _ = env.renv.step(action)
        lrew.append(rew)
        next_obs = next_obs.reshape(1, -1)
        obs = next_obs
    # Now we update the weights for each transition, accumulating the
    # discounted return G backwards through the episode.
    for t in reversed(range(LEN_EPISODE)):
        G = GAMMA * G + lrew[t]
        w = algorithms.update_mc(w, lphi[t], G, v, g, alpha)
    # Logging
    lw.append(w)
    lphi0.append(v(phi_0, w))
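# A plausible sketch of algorithms.update_mc for this linear/tile-coding
# setting, assuming v(phi, w) is the approximate value and g(phi, w) its
# gradient with respect to w (for a linear v, simply phi). This is the
# standard gradient Monte Carlo update, not necessarily the exact code in
# the algorithms module:
#     w <- w + alpha * (G - v(phi, w)) * grad_w v(phi, w)
def update_mc(w, phi, G, v, g, alpha):
    return w + alpha * (G - v(phi, w)) * g(phi, w)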
import slider
import policy
import time
import torch
import memory
import random
import math
import torch.nn.functional as F
import nstep

agent = policy.policy()
lagged_agent = policy.policy()
lagged_agent.copy_weights(agent)

replay_memory_size = 100000
replay_memory = memory.ReplayMemory(replay_memory_size)

# export OMP_NUM_THREADS=1

def live(iterations, batch_size, lagg, eps, improve_flag, num_steps):
    n_step = nstep.Nstep(num_steps)
    g = slider.Game()
    state = g.get_state()
    total_reward = 0
    start = time.time()
    for i in range(iterations):
        # eps-greedy
        if random.uniform(0, 1) < eps:
            action = random.randint(0, 3)
        else:
            # Greedy branch (the original snippet is truncated here; this
            # assumes the policy net maps a state to four action values).
            with torch.no_grad():
                action = torch.argmax(agent(state)).item()
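# A minimal sketch of the memory.ReplayMemory used above, assuming a ring
# buffer of transitions with uniform random sampling; the real class may
# differ.
import random
from collections import deque

class ReplayMemory:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, transition):
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)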
import os
import sys  # needed for the sys.path.insert calls below
import platform
import pathlib
import pandas as pd
import time
from random import randrange  # needed for the age initialization below

current_file_path = pathlib.Path(__file__).parent.absolute()
sys.path.insert(1, os.path.join(current_file_path, 'build'))
from myBinds import Domain, Cell
import numpy as np
import torch
from pympler import asizeof

sys.path.insert(1, os.path.join(current_file_path, '..', 'Pyy'))
from policy import policy

policy_model = policy()
policy_model.load_state_dict(
    torch.load(os.path.join(current_file_path, '..', 'Pyy', 'policy.pt')))
policy_model.eval()

class pyCell(Cell):
    NN = policy_model

    def __init__(self, env, class_name):  # can go
        super().__init__(env, class_name)
        self.age = randrange(0, 100)
        self.setup()

    def setup(self):
        NN_value = pyCell.NN.forward(self.age)
from graphics import *
import numpy as np
import policy
import torch
import time
import copy
from multiprocessing import Pool, cpu_count
from slider import *
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

ncpus = cpu_count()
width = 1500
height = 1000

agent = policy.policy(Game.state_space_size, Game.action_space_size)
optimizer = torch.optim.Adam(agent.parameters(), lr=3e-4, eps=1e-5)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print("DEVICE:", device)
agent = agent.to(device)

def getEpisode(n, agent):
    game = Game(width, height)
    states = np.zeros((n, game.state_space_size))
    actionprobs = []
    actions = np.zeros((n, game.action_space_size))
    values = []
    rewards = np.zeros(n)
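# Sketch: how the BatchSampler / SubsetRandomSampler imports above are
# typically combined to draw shuffled PPO-style minibatches from a finished
# episode buffer. MINIBATCH_SIZE and the reuse of the module-level `device`
# are illustrative assumptions, not taken from the original source.
MINIBATCH_SIZE = 64  # hypothetical value

def iter_minibatches(n, states):
    for idx in BatchSampler(SubsetRandomSampler(range(n)),
                            MINIBATCH_SIZE, drop_last=True):
        yield torch.as_tensor(states[idx], dtype=torch.float32, device=device)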