def test_ip_reputation_feature(self):
    """
    This method validates that the IPs are updated correctly and that the scores have the proper values.
    """
    self.init()
    self.redis.delete('AI:metadata:[email protected]')
    for from_ipaddress in ['189.20.3.1', '200.20.3.100', '200.20.3.100', '201.20.230.10']:
        metadata = self.redis.hgetall('AI:metadata:[email protected]')
        feat = Features(**self.features)
        if not metadata:
            # Here we feed redis for the first time with metadata. Will be executed only once
            tmetadata = ThemisMetaData(**self.METADATA_ITENS)
            self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis)
            # Merge the global features into the metadata
            tmetadata.update_features(**feat.as_dict)
        else:
            tmetadata = ThemisMetaData(**metadata)
        policy = Policy(self.redis)
        pdata = policy.getpolicy('default')
        milter_object = '*****@*****.**'
        milter_subject = 'Subject does matter'
        # Enable feeder feature
        feat.feederFeaturesEnabled = True
        feat.tmetadata = tmetadata
        feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject)
    score_object = self.redis.zrange(feat.ipReputationFeatureGroupByNamespace, 0, -1, withscores=True)[0][1]
    self.assertTrue(len(self.redis.smembers(feat.ipReputationFeatureNamespace)) == 3 and score_object == 3)
def main(env_name, num_episodes, render, VideoSave, gamma, lam, kl_targ, batch_size): killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, render) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) #aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim, env_name) scaler.resume() val_func = NNValueFunction(obs_dim, env_name) policy = Policy(obs_dim, act_dim, kl_targ, env_name) episode = 0 capture = False while episode < num_episodes: if VideoSave and not capture: env.ScreenCapture(5) capture = True trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def featureMCControl(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False):
    """
    qFunc = g_t
    """
    InitParameter = 0.1
    if echoSE:
        squareErrors = []
    policy = Policy(mdp)
    for i in range(len(policy.parameters)):
        policy.parameters[i] = InitParameter
    for _ in range(iterNum):
        if echoSE:
            squareErrors.append(getSquareErrorPolicy(policy))
        states, sFeatures, actions, rewards = [], [], [], []
        state = random.choice(mdp.states)
        sFeature = mdp.getFeature(state)
        isTerminal = False
        count = 0
        while not isTerminal and count < maxWalkLen:
            action = policy.epsilonGreedy(sFeature, epsilon)
            isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action)
            states.append(state)
            sFeatures.append(sFeature)
            rewards.append(reward)
            actions.append(action)
            state = nextState
            sFeature = nextSFeature
            count += 1
        g = 0.0
        for i in range(len(states) - 1, -1, -1):
            g *= mdp.gamma
            g += rewards[i]
        for i in range(len(states)):
            policy.update(sFeatures[i], actions[i], g, alpha)
            g -= rewards[i]
            g /= mdp.gamma
    if echoSE:
        return policy, squareErrors
    else:
        return policy
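The return bookkeeping in the two inner loops above is easy to misread: the backward pass leaves `g` equal to the discounted return of the whole walk, and the forward pass peels one reward off and divides by `gamma` so that each `policy.update` call sees the return from its own time step. A minimal standalone check of that identity (not part of the original code):

import numpy as np

rewards = [1.0, 0.0, 2.0]
gamma = 0.9

g = 0.0
for r in reversed(rewards):   # backward pass: g ends up equal to G_0
    g = r + gamma * g

returns = []
for r in rewards:             # forward pass, mirroring the update loop
    returns.append(g)
    g = (g - r) / gamma       # G_{i+1} = (G_i - r_i) / gamma

expected = [sum(gamma ** k * r for k, r in enumerate(rewards[i:])) for i in range(len(rewards))]
assert np.allclose(returns, expected)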
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, render) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim, env_name) val_func = NNValueFunction(obs_dim, env_name) policy = Policy(obs_dim, act_dim, kl_targ, env_name) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 #capture = False while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) """if episode > 600 and not capture: env.ScreenCapture(5) capture = True""" add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout scaler.save() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
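For reference, `add_disc_sum_rew` and `add_gae` in this training loop correspond to the usual discounted-return and Generalized Advantage Estimation quantities. The sketch below spells those formulas out; it is an assumption about what the helpers compute, not their actual source:

import numpy as np

def discounted_cumsum(x, discount):
    # out[t] = x[t] + discount * x[t+1] + discount^2 * x[t+2] + ...
    out = np.zeros_like(x)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

rewards = np.array([1.0, 1.0, 1.0])
values = np.array([0.5, 0.4, 0.3])
gamma, lam = 0.995, 0.98

disc_sum_rew = discounted_cumsum(rewards, gamma)                 # value-function targets
deltas = rewards + gamma * np.append(values[1:], 0.0) - values   # one-step TD residuals
advantages = discounted_cumsum(deltas, gamma * lam)              # GAE(gamma, lambda)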
def __init__(self, name, init_x, init_y, ch, color):
    self.name = name
    _fn = "dot%s" % self.name
    self.backup = basic.BackUp(_fn)
    self.chr = ch
    self.color = color
    self.V = (0., 0., 0., 0.,)
    """Set of individuals in the group """
    self.P = []
    """Gender counts """
    self.male = 0
    self.female = 0
    """Number of pregnancies """
    self.mating = 0
    self.policy = Policy()
    self.X = init_x + self.policy._func()
    self.Y = init_y + self.policy._func()
    self.load()
def main(): webStr = None queryString = None opts, args = getopt.getopt(sys.argv[1:], "i:q:", ["input", "query"]) for o, a in opts: if o == "-i": webStr = a elif o == "-q": queryString = a if webStr is None or queryString is None: print "Incorrect usage" sys.exit(-1) xsb = XSB() try: webStr = webStr.replace("<newline>", "\n") polStr = "\n".join([l for l in webStr.split("\n") if ":-" in l]) policy = Policy.fromString(escapeCharacters(polStr)) query = Atom.fromElements(Grammar.parseAtom(escapeCharacters(queryString))) policy.processPolicy() policy.checkQuery(query) xsb.loadPolicy(policy) print xsb.query(query) xsb.close() except Exception as e: print "Error:", e xsb.close() sys.exit(-1)
def __init__(self, policy, number_of_tasks=None):
    self.__policy_list = Policy()
    self.__policy = policy
    self.__by_priority = any(self.__policy == pol for pol in self.__policy_list.uses_priority())
    self.__setted_sort = self.__by_priority or policy == self.__policy_list.sjf
    self.__all = []
    self.__ready = []
    self.__finished = 0
    self.__number_of_tasks = number_of_tasks
def lspi(maxiter, epsilon, samples, basis, discount, initial_policy):
    """
    Runs the LSPI algorithm
    """
    iteration = -1
    distance = float('inf')
    policy = initial_policy
    all_policies = [initial_policy]

    while (iteration < maxiter) and (distance > epsilon):
        # print the number of iterations
        iteration = iteration + 1
        print('============================')
        print('LSPI iteration: %i' % iteration)
        if iteration == 0:
            firsttime = 1
        else:
            firsttime = 0

        policy = Policy(policy=policy)
        policy.weights = lstdq(samples, all_policies[iteration], policy)[0]

        diff = policy.weights - all_policies[iteration].weights
        LMAXnorm = LA.norm(diff, np.inf)
        L2norm = LA.norm(diff)
        distance = L2norm

        all_policies.append(policy)

    print('================================')
    if distance > epsilon:
        print('LSPI finished in %i iterations WITHOUT convergence to a fixed point' % iteration)
    else:
        print('LSPI converged in %i iterations' % iteration)
    print()
    print('weights')
    print(policy.weights)
    print()
    return policy, all_policies
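`lstdq` is only called above, not shown. The conventional LSTDQ solve it refers to (Lagoudakis & Parr) builds a linear system from the samples and returns the weight vector, roughly as in this hedged sketch; the simplified signature, the `basis(s, a)` feature map, and the `policy.best_action` accessor are illustrative assumptions, not the referenced implementation:

import numpy as np

def lstdq_sketch(samples, basis, discount, policy):
    """samples: list of (s, a, r, s_next) tuples; basis(s, a) -> feature vector phi."""
    k = len(basis(*samples[0][:2]))
    A = np.zeros((k, k))
    b = np.zeros(k)
    for s, a, r, s_next in samples:
        phi = basis(s, a)
        phi_next = basis(s_next, policy.best_action(s_next))   # greedy action of the evaluated policy (assumed accessor)
        A += np.outer(phi, phi - discount * phi_next)
        b += phi * r
    return np.linalg.solve(A + 1e-6 * np.eye(k), b)             # small ridge term for numerical stability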
def init(self): with open('../config/config.yaml') as f: _, config_features, _ = yaml.load_all(f) self.features = config_features self.redis = redis.StrictRedis('localhost') #self.redis.flushdb() self.redis.set_response_callback('HGETALL', self.hgetall_custom_callback) self.policy = Policy(self.redis) self.add_default_policy()
def main(): bellogFilename = None queryString = None datalogFilename = None opts, args = getopt.getopt(sys.argv[1:], 'i:q:o:', ['input', 'query']) for o, a in opts: if o == '-i': bellogFilename = a elif o == '-q': queryString = a elif o == '-o': datalogFilename = a if bellogFilename is None and (queryString is None or datalogFilename is None): print 'Usage: python', sys.argv[0], '-i <BelLog file> -q <query> [-o <Datalog filename>]' sys.exit(-1) fileStr = open(bellogFilename, 'r').read().strip() polStr = '\n'.join([l for l in fileStr.split('\n') if ':-' in l]) try: policy = Policy.fromString(escapeCharacters(polStr)) if queryString is not None: query = Atom.fromElements(Grammar.parseAtom(escapeCharacters(queryString))) policy.processPolicy() except Exception as e: print 'Error parsing the policy:', e sys.exit(-1) if queryString is not None: xsb = XSB() try: policy.checkQuery(query) xsb.loadPolicy(policy) print 'Query', queryString, ':', xsb.query(query) xsb.close() except Exception as e: print 'Error loading the policy:', e sys.exit(-1) xsb.close() if datalogFilename is not None: if os.path.isfile(datalogFilename): msg = 'Override ' + datalogFilename + '?' shall = True if raw_input("%s (y/N) " % msg).lower() == 'y' else False if not shall: sys.exit(-1) outFile = open(datalogFilename, 'w') for rule in policy.rules: for datalogRule in rule.toDatalogRules(): outFile.write(datalogRule + '.\n') for datalogRule in XSB.STATIC_RULES: outFile.write(datalogRule + '.\n') outFile.close()
def value_iteration(mdp, gamma=0.9, epsilon=0.0001): states = mdp.states actions = mdp.actions policy = Policy(mdp.states, mdp.actions) Vcurrent = np.zeros(len(mdp.states)) Vprevious = None fix_point = False while not fix_point: Vprevious = deepcopy(Vcurrent) for fromstate in states: values = [] for action in mdp.actions: value = 0. for tostate in mdp.get_neighbors(fromstate): p = mdp.get_probability(action, fromstate, tostate) r = mdp.get_reward(action, fromstate, tostate) v = Vprevious[tostate] value += p * (r + gamma * v) values.append(value) Vcurrent[fromstate] = max(values) del values fix_point = np.linalg.norm(Vcurrent - Vprevious, np.inf) < epsilon for fromstate in states: values = [] for action in actions: value = 0. for tostate in mdp.get_neighbors(fromstate): p = mdp.get_probability(action, fromstate, tostate) r = mdp.get_reward(action, fromstate, tostate) v = Vcurrent[tostate] value += p * (r + gamma * v) values.append(value) acts = np.argwhere(values == np.amax(values)).flatten().tolist() for a in acts: policy.set_probability(1. / len(acts), fromstate, a) for a in [ac for ac in actions if ac not in acts]: policy.set_probability(0., fromstate, a) del values return Vcurrent, policy
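The same Bellman optimality backup can be written in a few vectorized lines when the model is available as dense arrays. This is an illustrative alternative; the `P[a, s, s']` / `R[a, s, s']` array layout is an assumption and not the `mdp` interface used above:

import numpy as np

def value_iteration_dense(P, R, gamma=0.9, epsilon=1e-4):
    n_actions, n_states, _ = P.shape
    V = np.zeros(n_states)
    while True:
        # Q[a, s] = sum_s' P[a, s, s'] * (R[a, s, s'] + gamma * V[s'])
        Q = np.einsum('ast,ast->as', P, R + gamma * V[None, None, :])
        V_new = Q.max(axis=0)
        if np.linalg.norm(V_new - V, np.inf) < epsilon:
            return V_new, Q.argmax(axis=0)   # values and one greedy (deterministic) policy
        V = V_new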
def read(self): try: users_fd = open(self.users_file, "r") policies_fd = open(self.policies_file, "r") except IOError as e: self.log(e, "E") return False for line in users_fd.readlines(): l = line.strip().encode().split(b":") # TODO: move this check somewhere else if len(l) > 2: self.log("More than one ':' while parsing users file?", "E") return False if b"group" in l[0][0:5]: self.key_manager.add_group(l[0].split(b" ")[1]) for u in l[1].strip().split(b" "): self.key_manager.add_group_member(u, l[0].split(b" ")[1]) else: self.key_manager.add_user(l[0].strip(), l[1].strip()) users_fd.close() cnt = 0 for line in policies_fd.readlines(): cnt += 1 tokens = line.strip().encode().split(b" ") policy = Policy() policy.parameters = [] prev_token = None for token in tokens: if prev_token == b'-u': policy.user = token elif prev_token == b'-g': policy.group = token elif prev_token == b'-p': policy.parameters.append(token) prev_token = token policy.script = tokens[-1] if not self.policy_manager.add_policy(policy): self.log(" File %s line %d" % (config["path_to_policies"], cnt), "E") policies_fd.close()
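To make the flag handling concrete, here is the shape of a policies-file line this loop would accept (the field values are made up for illustration):

#   -u alice -g admins -p ARG1 -p ARG2 run_backup.sh
#
# parsed as: policy.user = b'alice', policy.group = b'admins',
#            policy.parameters = [b'ARG1', b'ARG2'], policy.script = b'run_backup.sh'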
class BotPlayer(Player):
    """Player when played by the program.

    The action to take for a hand is decided according to a policy.
    At first, the policy is based on an MDP.
    """

    def __init__(self, policy, dealer):
        super(BotPlayer, self).__init__()
        self.policy = Policy(policy)
        self.dealer = dealer

    def choose_action(self):
        # Policy determines choice
        choice = self.policy.action(self.hand_value, self.dealer.cards[0])
        return choice
def test_subject_feature_reset_time(self): """ This method validates if the subject feature is reseted properly. """ self.init() self.redis.delete('AI:metadata:[email protected]') for index in range(0, 3): metadata = self.redis.hgetall('AI:metadata:[email protected]') feat = Features(**self.features) if not metadata: # Here we feed redis for the first time with metadata. Will be executed only one once tmetadata = ThemisMetaData(**self.METADATA_ITENS) self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis) # We update tmetadata.update_features(**feat.as_dict) else: tmetadata = ThemisMetaData(**metadata) policy = Policy(self.redis) pdata = policy.getpolicy('default') milter_object = '*****@*****.**' from_ipaddress = '189.10.21.1' milter_subject = 'Repeated Subject' # Enable feeder feature feat.feederFeaturesEnabled = True # 0.5 second in hour pdata.subjectprobation = 0.000138888889 feat.tmetadata = tmetadata feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject) tmetadata = ThemisMetaData(**self.redis.hgetall('AI:metadata:[email protected]')) #print tmetadata.subject_repeated_count # If time sleep is 0.1 the subject_repeated_count will be two (2) time.sleep(0.5) # Must return 1 because the subject is repeated and always reseted to 1 score_milter_object = self.redis.zrange('AI:subjectReputationFeature:groupby', 0, -1, withscores=True)[0][1] self.assertTrue(tmetadata.subject_repeated_count == 1 and score_milter_object == 1)
def test_subject_feature_count(self): """ This method validates if the subject feature is counted properly, generating 3 requests with the same subject, then changing it to validate if the count is correct """ self.init() self.redis.delete('AI:metadata:[email protected]') for index in range(0, 5): metadata = self.redis.hgetall('AI:metadata:[email protected]') feat = Features(**self.features) if not metadata: # Here we feed redis for the first time with metadata. Will be executed only one once tmetadata = ThemisMetaData(**self.METADATA_ITENS) self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis) # We update tmetadata.update_features(**feat.as_dict) else: tmetadata = ThemisMetaData(**metadata) policy = Policy(self.redis) pdata = policy.getpolicy('default') milter_object = '*****@*****.**' from_ipaddress = '189.10.21.1' if index == 4: milter_subject = 'Must NOT Update Subject Count, because subject title is different at last loop' else: milter_subject = 'Repeated Subject until index is 3' # Enable feeder feature feat.feederFeaturesEnabled = True feat.tmetadata = tmetadata feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject) tmetadata = ThemisMetaData(**self.redis.hgetall('AI:metadata:[email protected]')) # Must be null because at the index 4 the subject is changed, so the namespace is deleted by this object score_milter_object = self.redis.zrange('AI:subjectReputationFeature:groupby', 0, -1, withscores=True) # Should assert True with 3 because at the index 4 I change the subject name self.assertTrue(tmetadata.subject_repeated_count == 3 and not score_milter_object)
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() #TODO Change init_gym for one of my functions env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H:%M:%S").replace( ":", "_") # create unique directories logger = Logger(logname=env_name, now=now) aigym_path = os.path.join('/tmp', env_name, now) #Change wrappers.Monitor for a class of mine that controls de simulation #Creo que el wrapper no sirve de nada para mi ejemplo #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def policy_iteration(self, policy=None):
    w = self.world
    if policy == None:
        policy = Policy.build_deterministic([0] * w.width * w.height)
    while True:
        old_policy = policy
        vs = self.iterative_policy_evaluation(policy, 20)
        qvs = self.qvs_from_vs(vs)
        policy = self.qvs_to_policy(qvs)
        if policy == old_policy:
            break
    return policy
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar, clipping_range): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar, clipping_range) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 while episode < num_episodes: # trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) # episode += len(trajectories) # add_value(trajectories, val_func) # add estimated values to episodes # add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs # add_gae(trajectories, gamma, lam) # calculate advantage # # concatenate all episodes into single NumPy arrays # observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # # add various stats to training log: # log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) # policy.update(observes, actions, advantages, logger) # update policy # val_func.fit(observes, disc_sum_rew, logger) # update value function # logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False if episode % 100 == 0: policy.save_sess() logger.close() policy.close_sess() val_func.close_sess()
def run_cartpole():
    env = gym.make('CartPole-v0')
    obs_dim = np.prod(env.observation_space.shape)
    n_actions = env.action_space.n
    policy_hidden_dim = 256
    policy = Policy(obs_dim, policy_hidden_dim, n_actions)
    exp = Experiment(policy, None, env, exp_name="cartpole_basic",
                     train_model=False, calc_inf_gain=False)
    exp.train()
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array, theta: float) -> Tuple[np.array, np.array]: """ inp: env: environment with model information, i.e. you know transition dynamics and reward function pi: policy initV: initial V(s); numpy array shape of [nS,] theta: exit criteria return: V: $v_\pi$ function; numpy array shape of [nS] Q: $q_\pi$ function; numpy array shape of [nS,nA] """ n_s = env.spec.nS n_a = env.spec.nA trans_mat = env.TD reward_matrix = env.R delta = theta v = initV q = np.zeros((n_s, n_a)) while delta >= theta: delta = 0 for s in range(n_s): current_state_val = v[s] result = 0 for a in range(n_a): trans = trans_mat[s][a] sum_val = 0 for i in range(len(trans)): next_state = i prob = trans[i] sum_val += (prob * (reward_matrix[s][a][next_state] + (env.spec.gamma * v[next_state]))) q[s][a] = sum_val result += pi.action_prob(s, a) * sum_val v[s] = result delta = max(delta, abs(v[s] - current_state_val)) V = v Q = q return V, Q
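An equivalent synchronous (Jacobi-style) sweep can be written with dense arrays; both it and the in-place loop above converge to the same v_pi. This sketch assumes the policy's action probabilities have been materialized into a `pi_mat[s, a]` array, which is not part of the `Policy` interface used above:

import numpy as np

def value_prediction_dense(TD, R, pi_mat, gamma, initV, theta):
    V = initV.astype(float).copy()
    while True:
        # Q[s, a] = sum_s' TD[s, a, s'] * (R[s, a, s'] + gamma * V[s'])
        Q = np.einsum('sat,sat->sa', TD, R + gamma * V[None, None, :])
        V_new = np.einsum('sa,sa->s', pi_mat, Q)
        if np.max(np.abs(V_new - V)) < theta:
            return V_new, Q
        V = V_new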
def parseJsonDict(self, jsonReport): """ Parses the given 'jsonReport' according to the parameters set in the constructor of this ReportParser and returns a Report object. 'jsonReport' is expected to be a Python dict object with attribute names and values corresponding to the definition of CSP violation reports. If 'jsonReport' cannot be parsed because it is syntactically invalid (or empty), Report.INVALID() will be returned. Depending on the configuration of this ReportParser object, some attributes will be parsed to replace their plain string values with a more high-level object representation. """ # replace names renamedReport = dict(map(lambda (key, val): (self._replaceName(key), val), jsonReport.iteritems())) # convert data in report convertedReport = {} deferredSelfURIs = set([]) # all key names that have URIs that are exactly 'self' (handle after parsing everything else) for (key, value) in renamedReport.iteritems(): if key in self._uriKeys: if value.lower().strip() == "self": deferredSelfURIs.add(key) continue else: value = self._uriParser.parse(value) elif key in self._directiveKeys: value = self._directiveParser.parse(value) elif key in self._policyKeys: value = self._policyParser.parse(value) if value in (URI.INVALID(), Directive.INVALID(), Policy.INVALID()): if self._strict: return Report.INVALID() else: continue convertedReport[key] = value # handle deferred parsing of 'self' URIs (they represent the document-uri) for key in deferredSelfURIs: if "document-uri" in self._uriKeys and "document-uri" in convertedReport: convertedReport[key] = convertedReport["document-uri"] elif self._strict: return Report.INVALID() for requiredKey in self._requiredKeys: if not requiredKey in convertedReport: return Report.INVALID() return Report(convertedReport)
def test_tile():
    env = gym.make("MountainCar-v0")
    gamma = 1.
    policy = Policy()
    V = ValueFunctionWithTile(env.observation_space.low,
                              env.observation_space.high,
                              num_tilings=10,
                              tile_width=np.array([.45, .035]))
    semi_gradient_n_step_td(env, 1., policy, 10, 0.01, V, 1000)
    Vs = [V(s) for s in testing_states]
    print(Vs)
    assert np.allclose(Vs, correct_values, 1e-2, 3), \
        f'{correct_values} != {Vs}, but it might be due to stochasticity'
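The tile widths in this test line up with MountainCar-v0's observation bounds: a quarter of each dimension's range, so every one of the 10 tilings covers the space with roughly 4 tiles per dimension.

import numpy as np

low = np.array([-1.2, -0.07])    # MountainCar-v0 position / velocity lower bounds
high = np.array([0.6, 0.07])
print((high - low) / 4)           # -> [0.45  0.035], matching tile_width above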
def handle(self):
    # self.request is the TCP socket connected to the client
    self.data = self.request.recv(128).strip()
    data = eval(self.data)
    if data.has_key('host'):
        Reg.regitsger(data)
        return
    plugin_id = data['id']
    MyQueue.heartbeat_queue.put(data)
    print u"connection from plugin:", plugin_id
    ret_list = Policy.if_update(plugin_id)
    print u"ret_list:", ret_list, plugin_id
    strlist = string.join(ret_list, ":")
    self.request.sendall(strlist)
    print u"sendall:", ret_list, plugin_id
def set_policy(self, policy):
    """
    Change the current execution policy.
    If the given name is not a known policy, it will remain unchanged.
    """
    data = {"status": "OK"}
    orig_policy = self.policy
    self.policy = Policy.factory(policy)
    if self.policy is None:
        data["status"] = "KO"
        data["message"] = "Policy {} is not a known policy".format(policy)
        self.error_log("set_policy", "Policy {} is not a known policy".format(policy))
        self.policy = orig_policy
    return json.dumps(data).encode()
class FinalModel:
    def __init__(self, env):
        # Load your Model here
        self.sess = tf.Session()
        self.saver = tf.train.import_meta_graph('policy_model/test.meta')
        self.saver.restore(self.sess, tf.train.latest_checkpoint('policy_model/'))
        self.action_size = env.action_space.shape[0]
        self.policy = Policy(env.observation_space.shape[0] + 1, self.action_size, 0.003, 10, -1.0, None)

    def get_action(self, state):
        # change to your model
        obs = state.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[0]], axis=1)  # add time step feature
        action = self.policy.sample(obs).reshape((1, -1)).astype(np.float32)
        return np.squeeze(action, axis=0)
def initAgent(self):
    # micro-loan business simulation environment instance
    env = SimulationEnv()
    # feature transformer instance to convert numerous outputs of environment into simple numeric variables understood by the RL agent
    ft = FeatureTransformer(env)
    # value function model instance - the brain of the RL agent. Approximates value of each action in every state of environment
    lr = 0.0001  # learning rate defines how adaptive the value function is to new input
    model = Model(env, ft, lr)
    # environment model instance - the planning center of the agent. Predicts future environment states based on the current one
    env_model = EnvironmentModel(env, lr)
    # policy instance - includes different kinds of behaviors the agent can use to interact with the environment
    policy = Policy(env)
    # the agent instance - the guy that uses all of the above in order to optimize whatever you need
    eps = 1  # exploration rate defines how much randomness to use to explore the environment
    gamma = 0.95  # discounting rate defines how quickly the agent forgets its previous experience
    agent = Agent(env, model, env_model, policy, eps, gamma, gamma)
    return agent
def mutation(mutate_pop, p=0.05):
    policy = mutate_pop[0]
    new_policy = Policy(policy.shape, policy.hidden_units, policy.num_actions, policy.game)
    for i in range(len(policy.W)):
        w = np.zeros((policy.W[i].shape[0], policy.W[i].shape[1]))
        b = np.zeros(policy.B[i].shape[0])
        for j in range(len(policy.W[i])):
            for k in range(len(policy.W[i][j])):
                w[j][k] = policy.W[i][j][k] + p * np.random.normal()
        for j in range(len(policy.B[i])):
            b[j] = policy.B[i][j] + p * np.random.normal()
        new_policy.W.append(w)
        new_policy.B.append(b)
    return new_policy
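The element-wise loops above can be collapsed with numpy's vectorized sampling. A behavior-equivalent sketch (it reuses the `Policy` constructor signature from the snippet, so it is not self-contained):

import numpy as np

def mutation_vectorized(mutate_pop, p=0.05):
    policy = mutate_pop[0]
    new_policy = Policy(policy.shape, policy.hidden_units, policy.num_actions, policy.game)
    for W, B in zip(policy.W, policy.B):
        # perturb every weight and bias independently with Gaussian noise of scale p
        new_policy.W.append(W + p * np.random.normal(size=W.shape))
        new_policy.B.append(B + p * np.random.normal(size=B.shape))
    return new_policy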
def main(env_name, num_episodes, gamma, lam, kl_targ, batch_size, TestNote):
    """ Main training loop

    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor (float)
        lam: lambda from Generalized Advantage Estimate
        kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new)
        batch_size: number of episodes per policy training batch
    """
    print('Testing Period:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    killer = GracefulKiller()
    env, obs_dim, act_dim = init_gym(env_name)
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    env.set_goals(0)
    now = datetime.now().strftime("%b-%d_%H:%M:%S")  # create unique directories (utcnow is Greenwich time, so use now instead)
    testname = now + '-' + TestNote
    logger = Logger(logname=env_name, now=testname)
    aigym_path = os.path.join('log-Test-files', env_name, testname)
    env = wrappers.Monitor(env, aigym_path, force=True)
    scaler = Scaler(obs_dim)
    val_func = NNValueFunction(obs_dim)
    policy = Policy(obs_dim, act_dim, kl_targ)
    # load pretrained weights instead of running untrained episodes to initialize the scaler
    policy.load_model('/home/drl/PycharmProjects/warker_test/log-files/My3LineDirect-v1/Jan-10_07:51:34-A003-SpecGoal-itr15000-g0ExpNo5/checkpoint/My3LineDirect-v1-15000.ckpt')
    episode = 0
    observes, actions, rewards, unscaled_obs, states_x, states_y = rollout(env, policy, scaler,
                                                                           max_path_length=batch_size,
                                                                           animate=True)
    tmp = np.vstack((rewards, states_x, states_y))
    tmp1 = np.transpose(tmp)
    data = np.concatenate((observes, actions, tmp1), axis=1)
    trajectory = {}
    for j in range(data.shape[0]):
        for i in range(data.shape[1]):
            trajectory[i] = data[j][i]
        logger.log(trajectory)
        logger.write(display=False)
    logger.close()
    policy.close_sess()
    val_func.close_sess()
    print('End time:\n')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
def test(rank, shared_model, counter): with open('log.txt', 'w'): pass env_name = 'Pong-v0' env = gym.make(env_name) env.seed(rank) torch.manual_seed(rank) policy = Policy() policy.eval() policy.cpu() start_time = time.time() while True: policy.load_state_dict(shared_model.state_dict()) h = rnn_model.init_() state = env.reset() reward_sum = 0 episode_length = 0 while True: env.render() a, h = wow(state, h, policy, vae_model, rnn_model) state, reward, done, _ = env.step(a) reward_sum += reward if done: string = "Time {}, num steps {}, FPS {:.0f}, episode reward {}, episode length {}".format( time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time)), counter.value, counter.value / (time.time() - start_time), reward_sum, episode_length) print(string) with open('log.txt', 'a') as f: f.write(string + '\n') time.sleep(5) break
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]: """ inp: env: environment with model information, i.e. you know transition dynamics and reward function pi: policy initV: initial V(s); numpy array shape of [nS,] theta: exit criteria return: V: $v_\pi$ function; numpy array shape of [nS] Q: $q_\pi$ function; numpy array shape of [nS,nA] """ ##################### # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75) ##################### nS = env.spec.nS nA = env.spec.nA gamma = env.spec.gamma TD = env.TD R = env.R V = initV Q = np.zeros((nS,nA)) while True: delta = 0; for i in range(nS): prevVal = V[i] action_sum_temp = 0 for j in range(nA): act_pr = pi.action_prob(i,j) action_sum_temp = action_sum_temp + act_pr * sum(TD[i,j,:] * (R[i,j,:] + gamma*V)) V[i] = action_sum_temp delta = max(delta, abs(V[i]-prevVal) ) if delta<theta: break for s in range(nS): for a in range(nA): Q[s,a] = sum(TD[s,a,:] * (R[s,a,:] + gamma*V)) return V, Q
def optimal(self, taxes=(), entitlement=(), militaristic=False):
    return Policy(
        self.name + "_optimal",
        taxes,
        self.proImm,
        self.prolgbt,
        self.proWar,
        entitlement,
        (
            self.race["white"],
            self.race["black"],
            self.race["amind"],
            self.race["asian"],
            self.race["hawaii"],
            self.race["other"],
        ),
        self.proWar,
        militaristic,
    )
def invade(botnet, network, printing=False): """ Let the botnet invade once the network. :param botnet: a (learning) botnet, set up on this network, and which must have been cleared beforehand :param network: the network to invade :param printing: if True, prints all the details about the invasions :return: a list of all (action, result), the total reward received, and the expected reward of the induced policy (which is constructed by taking the successful actions) """ actions = [] reward = 0 policy = [] t = 0 while not botnet.state.is_full(): action = botnet.choose_action() if printing: print("Action ", t) print("Remaining nodes = %s" % botnet.state.nb_remaining()) print("Attack %s!" % action) success = network.attempt_hijacking(botnet.state, action) if success: policy.append(action) if printing: print("Success\n") else: if printing: print("Failure\n") immediate_reward = network.immediate_reward(botnet.state, action) if success and botnet.state.add(action).is_full(): immediate_reward += botnet.gamma * network.final_reward( botnet.gamma) reward += botnet.time_factor * immediate_reward botnet.receive_reward(action, success, immediate_reward) actions.append((action, success)) t += 1 return actions, reward, Policy(network, policy).expected_reward(botnet.gamma)
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]: nA = env.spec.nA # Number of actions nS = env.spec.nS # Number of states V = np.zeros(nS) R = env.R # Reward function T = env.TD # State transition function gamma = env.spec.gamma # Gamma """ Iteratively sweep through all states, """ while True: delta = 0 for s in range(nS): value = 0 for a in range(nA): prob = pi.action_prob(s, a) for sp in range(nS): value += (prob * (T[s, a, sp] * (R[s, a, sp] + gamma * initV[sp]))) V[s] = value delta = max(delta, abs(initV[s] - V[s])) """ Check for convergence """ if delta < theta: break initV = V.copy() # Must make an explicit copy """ With the values function having converged, we can now extract the optimal action probabilities for each state, using the Bellman optimality equation. """ Q = np.zeros(shape=(nS, nA)) for s in range(nS): for a in range(nA): for sp in range(nS): Q[s, a] += T[s,a,sp] * (R[s,a,sp] + gamma * V[sp]) return V, Q
def main(env_name, num_episodes, render, gamma, lam, kl_targ, batch_size): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name, render) if time_state: obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = datetime.utcnow().strftime("%b-%d_%H-%M-%S") # create unique directories logger = Logger(logname=env_name, now=now) scaler = Scaler(obs_dim, env_name) val_func = NNValueFunction(obs_dim, env_name) policy = Policy(obs_dim, act_dim, kl_targ, env_name) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, episodes=5) episode = 0 #capture = False while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, episodes=batch_size) episode += len(trajectories) """if episode > 600 and not capture: env.ScreenCapture(5) capture = True""" add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout scaler.save() if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() policy.close_sess() val_func.close_sess()
def main(num_episodes, gamma, lam, kl_targ, batch_size, hid1_mult, policy_logvar): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch hid1_mult: hid1 size for policy and value_f (mutliplier of obs dimension) policy_logvar: natural log of initial policy variance """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym() env_name = "retro" obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) #obs_dim = 215041 obs_dim = 7057 now = datetime.utcnow().strftime( "%b-%d_%H:%M:%S") # create unique directories #aigym_path = os.path.join('/tmp', env_name, now) #env = wrappers.Monitor(env, aigym_path, force=True) scaler = Scaler(obs_dim) val_func = NNValueFunction(obs_dim, hid1_mult) policy = Policy(obs_dim, act_dim, kl_targ, hid1_mult, policy_logvar) # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, episodes=5) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, episodes=batch_size) episode += len(trajectories) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set( trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, episode) policy.update(observes, actions, advantages) # update policy val_func.fit(observes, disc_sum_rew) # update value function if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False policy.close_sess() val_func.close_sess()
def train_and_eval(cfg: BaseConfig): if cfg.path is None: print('cfg.path is None, so FasterAutoAugment is not used') policy = None else: path = Path(hydra.utils.get_original_cwd()) / cfg.path assert path.exists() policy_weight = torch.load(path, map_location='cpu') policy = Policy.faster_auto_augment_policy( num_chunks=cfg.model.num_chunks, **policy_weight['policy_kwargs']) policy.load_state_dict(policy_weight['policy']) train_loader, test_loader, num_classes = DATASET_REGISTRY(cfg.data.name)( batch_size=cfg.data.batch_size, drop_last=True, download=cfg.data.download, return_num_classes=True, norm=[ transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) ], num_workers=4) model = MODEL_REGISTRY(cfg.model.name)(num_classes) optimizer = optim.SGD(cfg.optim.lr, momentum=cfg.optim.momentum, weight_decay=cfg.optim.weight_decay, nesterov=cfg.optim.nesterov) scheduler = lr_scheduler.CosineAnnealingWithWarmup( cfg.optim.epochs, cfg.optim.scheduler.mul, cfg.optim.scheduler.warmup) tqdm = callbacks.TQDMReporter(range(cfg.optim.epochs)) c = [callbacks.LossCallback(), callbacks.AccuracyCallback(), tqdm] with EvalTrainer(model, optimizer, F.cross_entropy, callbacks=c, scheduler=scheduler, policy=policy, cfg=cfg.model, use_cuda_nonblocking=True) as trainer: for _ in tqdm: trainer.train(train_loader) trainer.test(test_loader) print(f"Min. Error Rate: {1 - max(c[1].history['test']):.3f}")
def main(): RESTORE_MIMIC = False TRAIN = False EVALUATE_MIMIC = False VISUALIZE_MIMIC = True VISUALIZE_MASTER = False # Open AI gym environment env, obs_dim, act_dim = init_gym('Hopper-v1') # Make master policy which was trained by R masterpolicy = Policy(obs_dim, act_dim, 0.003, stochastic_policy=False) masterpolicy.restore_weights() scaler = Scaler(obs_dim) if VISUALIZE_MASTER: masterpolicy.visualize(env) print("Loaded masterpolicy. ") mimicpolicy = ReactivePolicy(obs_dim, act_dim) if RESTORE_MIMIC: mimicpolicy.restore_weights() if TRAIN: # Train the mimic by master t1 = time.time() train_mimic(mimicpolicy, masterpolicy, env, scaler, n_episodes=25, batchsize=32) print("Training time taken: {}".format(time.time() - t1)) mimicpolicy.save_weights() print("Saved mimic weights") if EVALUATE_MIMIC: # Evaluate the policy by MSE print("Starting evaluation...") mse = evaluate_mimic(mimicpolicy, masterpolicy, scaler, env, 10) print("Average MSE of mimic: {}".format(mse)) if VISUALIZE_MIMIC: # Visualise the policy print("Visualizing policy") mimicpolicy.visualize(env)
def value_prediction(env: EnvWithModel, pi: Policy, initV: np.array, theta: float) -> Tuple[np.array, np.array]: """ inputs: env: environment with model information, i.e. you know transition dynamics and reward function pi: policy initV: initial V(s); numpy array shape of [nS,] theta: exit criteria return: V: $v_\pi$ function; numpy array shape of [nS] Q: $q_\pi$ function; numpy array shape of [nS,nA] """ ##################### # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75) ##################### V = initV.copy() #Arbitrary except terminal states must be 0 delta = theta while delta >= theta: delta = 0 for s in range(env.spec.nS): v = V[s] #action probabilities: pi.action_prob(s) is an array (size nA) #transition dynamics: env.TD[state,action,state_t+1] is an array of probabilities (size nS) #rewards: env.R[state,action,state_t+1] update = 0 for a in range(env.spec.nA): #Note below after action_prob the extra zero is because the array is wrapped in another array since array[None] #puts the array in another array. Though this should always happen since action_prob shouldn't be passed an action update += pi.action_prob(s, a) * np.sum( env.TD[s, a, :] * (env.R[s, a, :] + env.spec.gamma * V)) V[s] = update delta = max(delta, np.abs(v - V[s])) Q = np.zeros((env.spec.nS, env.spec.nA)) for s in range(env.spec.nS): for a in range(env.spec.nA): Q[s, a] = np.sum(env.TD[s, a, :] * (env.R[s, a, :] + env.spec.gamma * V)) #Note: summing over the actions in pi(a|s)*Q(s,a) gives V(s) return V, Q
def test_botnet(botnet, network, nb_trials, window_size=1, real_rewards=False, induced_rewards=False, policy_rewards=True, show=False): """ Plots the expected reward of the induced policy over trainings, and prints the expected reward of the computed policy. :param botnet: :param network: :param nb_trials: :param window_size: :param real_rewards: whether to plot the real rewards received during the training :param induced_rewards: whether to plot the expected rewards of the induced policy :param policy_rewards: whether to plot the expected rewards of the full-exploitation policy :param show: if True, shows the results :return: """ rewards, expected, policy = train(botnet, network, nb_trials) if real_rewards: plot_with_legend(list(range(nb_trials)), soften(rewards, window_size), legend=botnet.type + " real") if induced_rewards: plot_with_legend(list(range(nb_trials)), soften(expected, window_size), legend=botnet.type + " induced") if policy_rewards: plot_with_legend(list(range(nb_trials)), soften(policy, window_size), legend=botnet.type + " policy") print(botnet.compute_policy()) print(botnet.type, Policy(network, botnet.compute_policy()).expected_reward(botnet.gamma), sep='\t') if show: show_with_legend()
def value_prediction(env:EnvWithModel, pi:Policy, initV:np.array, theta:float) -> Tuple[np.array,np.array]: """ inp: env: environment with model information, i.e. you know transition dynamics and reward function pi: policy initV: initial V(s); numpy array shape of [nS,] theta: exit criteria return: V: $v_\pi$ function; numpy array shape of [nS] Q: $q_\pi$ function; numpy array shape of [nS,nA] """ ##################### # TODO: Implement Value Prediction Algorithm (Hint: Sutton Book p.75) ##################### num_states = env.spec.nS num_actions = env.spec.nA V = np.array(initV) Q = np.zeros((num_states,num_actions)) R = env.R TD = env.TD change = theta + 1 while change > theta: change = 0 for i in range(num_states): old_v = V[i] new_v = 0 for j in range(num_actions): sum_a = 0 for k in range(num_states): sum_a += TD[i,j,k]*(R[i,j,k]+env.spec.gamma*V[k]) new_v += pi.action_prob(i,j)*sum_a change = max(change, abs(new_v-old_v)) V[i] = new_v for i in range(num_states): for j in range(num_actions): for k in range(num_states): Q[i][j] += TD[i,j,k]*(R[i,j,k]+env.spec.gamma*V[k]) return V, Q
def main(num_episodes, gamma, lam, kl_targ, batch_size, env_name="Hopper-v2"): """ Main training loop Args: env_name: OpenAI Gym environment name, e.g. 'Hopper-v1' num_episodes: maximum number of episodes to run gamma: reward discount factor (float) lam: lambda from Generalized Advantage Estimate kl_targ: D_KL target for policy update [D_KL(pi_old || pi_new) batch_size: number of episodes per policy training batch """ killer = GracefulKiller() env, obs_dim, act_dim = init_gym(env_name) obs_dim += 1 # add 1 to obs dimension for time step feature (see run_episode()) now = (datetime.datetime.utcnow() - datetime.timedelta(hours=4)).strftime("%b-%d_%H:%M:%S") # create dictionaries based on ets time logger = Logger(logname=env_name, now=now) plotter = Plot(plotname=env_name+"-Fig", now=now) aigym_path = os.path.join('/tmp', env_name, now) # env = wrappers.Monitor(env, aigym_path, force=True) # recording, dir?? scaler = Scaler(obs_dim) # obs_dim=377 val_func = NNValueFunction(obs_dim) policy = Policy(obs_dim, act_dim, kl_targ) # kl target=0.003 by default # run a few episodes of untrained policy to initialize scaler: run_policy(env, policy, scaler, logger, plotter, episodes=5, plot=False) episode = 0 while episode < num_episodes: trajectories = run_policy(env, policy, scaler, logger, plotter, episodes=batch_size) episode += len(trajectories) # length of trajectories equals batch size which by default is 20 plotter.updateEpisodes(episode) add_value(trajectories, val_func) # add estimated values to episodes add_disc_sum_rew(trajectories, gamma) # calculated discounted sum of Rs add_gae(trajectories, gamma, lam) # calculate advantage # concatenate all episodes into single NumPy arrays observes, actions, advantages, disc_sum_rew = build_train_set(trajectories) # add various stats to training log: log_batch_stats(observes, actions, advantages, disc_sum_rew, logger, episode) policy.update(observes, actions, advantages, logger, plotter) # update policy val_func.fit(observes, disc_sum_rew, logger) # update value function logger.write(display=True) # write logger results to file and stdout if killer.kill_now: if input('Terminate training (y/[n])? ') == 'y': break killer.kill_now = False logger.close() plotter.plot() # plt.show() policy.close_sess() val_func.close_sess()
def worker_policy_usr(args, manager, config):
    init_logging_handler(args.log_dir, '_policy_usr')
    if args.config == 'multiwoz':
        print("MultiWoz Agent Usr")
        agent = Policy(None, args, manager, config, 0, 'usr', True)
    elif args.config == 'dstcsgds':
        print("DSTC Agent Usr")
        agent = DSTCPolicy(None, args, manager, config, 0, 'usr', True)
    else:
        raise NotImplementedError('Policy usr of the dataset {} not implemented'.format(args.config))

    best = float('inf')
    for e in range(2):
        agent.imitating(e)
        best = agent.imit_test(e, best)
class FinalModel:
    def __init__(self, env):
        # Load your Model here
        self.action_size = env.action_space.shape[0]
        self.policy = Policy(env.observation_space.shape[0] + 1, self.action_size, 0.003, 10, -1.0, None)
        self.saver = tf.train.import_meta_graph('weights/50score/100000_episodes.meta')
        self.saver.restore(self.policy.sess, tf.train.latest_checkpoint('weights/50score/'))

    def get_action(self, state):
        # scale, offset = Scaler.get()
        # scale[-1] = 1.0
        # offset[-1] = 0.0
        # change to your model
        obs = state.astype(np.float32).reshape((1, -1))
        obs = np.append(obs, [[0]], axis=1)  # add time step feature
        action = self.policy.sample(obs).reshape((1, -1)).astype(np.float32)
        return np.squeeze(action, axis=0)
def run_cartpole_expl():
    env = gym.make('CartPole-v0')
    obs_dim = np.prod(env.observation_space.shape)
    act_dim = np.prod(env.action_space.shape)
    n_actions = env.action_space.n
    policy_hidden_dim = 256
    policy = Policy(obs_dim, policy_hidden_dim, n_actions)
    input_dim = int(obs_dim + act_dim)
    output_dim = int(obs_dim)
    hidden_dim = 64
    model = BNN(input_dim, hidden_dim, output_dim)
    exp = Experiment(policy, model, env, exp_name="cartpole_expl",
                     train_model=True, calc_inf_gain=True)
    exp.train()
def __init__(self, env, sess, horizon, epsilon, learning_rate_policy, learning_rate_value, gamma, lam, logger):
    self.env = env
    self.sess = sess
    self.horizon = horizon
    self.epsilon = epsilon
    self.learning_rate_policy = learning_rate_policy
    self.learning_rate_value = learning_rate_value
    self.gamma = gamma
    self.lam = lam
    self.logger = logger
    self.observation_space = env.observation_space.shape[0]
    self.action_space = env.action_space.shape[0]
    self.policy = Policy(self.observation_space, self.action_space, self.epsilon, self.learning_rate_policy)
    self.value_function = Value_function(self.observation_space, self.learning_rate_value)
    self.replay_memory = ReplayMemory(self.horizon, self.observation_space, self.action_space)
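If `epsilon` here is the PPO clipping parameter (an assumption based on how it is passed into `Policy` alongside the policy learning rate), the surrogate objective the policy network typically maximizes is the clipped probability-ratio term:

import numpy as np

def ppo_clipped_objective(new_logp, old_logp, advantages, epsilon):
    ratio = np.exp(new_logp - old_logp)                       # pi_new(a|s) / pi_old(a|s)
    clipped = np.clip(ratio, 1.0 - epsilon, 1.0 + epsilon)
    return np.mean(np.minimum(ratio * advantages, clipped * advantages))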
def featureQLearning(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False): """ qFunc = r + max_{a'}(\gamma * q(\hat{s'}, a')) """ InitParameter = 0.1 if echoSE: squareErrors = [] policy = Policy(mdp) for i in range(len(policy.parameters)): policy.parameters[i] = InitParameter for _ in range(iterNum): if echoSE: squareErrors.append(getSquareErrorPolicy(policy)) state = random.choice(mdp.states) sFeature = mdp.getFeature(state) action = random.choice(mdp.actions) isTerminal = False count = 0 while not isTerminal and count < maxWalkLen: isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action) maxQ = -1.0 for nextAction in mdp.actions: q = policy.qFunc(nextSFeature, nextAction) if maxQ < q: maxQ = q policy.update(sFeature, action, reward + mdp.gamma * maxQ, alpha) action = policy.epsilonGreedy(nextSFeature, epsilon) state = nextState sFeature = nextSFeature count += 1 if echoSE: return policy, squareErrors else: return policy
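The `policy.update(sFeature, action, target, alpha)` call above pairs naturally with a semi-gradient step on a linear Q-function. The helper below spells that step out; it is an illustrative assumption about what the `Policy` class does internally, not its actual code:

import numpy as np

def linear_q_update(theta, phi, action, target, alpha):
    """theta: (n_actions, n_features) weight matrix; q(s, a) = theta[a] . phi."""
    q = theta[action] @ phi
    theta[action] += alpha * (target - q) * phi   # target = r + gamma * max_a' q(s', a') as built in the loop above
    return theta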
def policySARSA(mdp, epsilon, alpha, iterNum, maxWalkLen=100, echoSE=False): """ Actor-Critic: actor update the policy, critic update the value. """ InitParameter = 0.1 if echoSE: squareErrors = [] policy = SoftmaxPolicy(mdp) valuePolicy = Policy(mdp) for i in range(len(policy.parameters)): policy.parameters[i] = InitParameter valuePolicy.parameters[i] = 0.0 for _ in range(iterNum): if echoSE: squareErrors.append(getSquareErrorPolicy(valuePolicy)) state = random.choice(mdp.states) sFeature = mdp.getFeature(state) action = random.choice(mdp.actions) isTerminal = False count = 0 while not isTerminal and count < maxWalkLen: isTerminal, nextState, reward, nextSFeature = mdp.transform(state, action) nextAction = policy.epsilonGreedy(nextSFeature, epsilon) valuePolicy.update(sFeature, action, reward + mdp.gamma * valuePolicy.qFunc(nextSFeature, nextAction), alpha) policy.update(sFeature, action, valuePolicy.qFunc(sFeature, action), alpha) sFeature = nextSFeature action = nextAction count += 1 if echoSE: return policy, squareErrors else: return policy
def savedPolicies(self):
    from policy import Policy
    return Policy.query(ancestor=self.key).filter(Policy.onSyllabus == False).fetch()
self.u=tf.tanh(sampleNormal(mu,tf.exp(logsigma)),name="u") self.policy_vars = [v for v in tf.all_variables() if v.name.startswith(vs.name)] print([v.name for v in self.policy_vars]) def eval(self,sess,x): return sess.run(self.u,{self.x:x}) def set_reward(self,r): # set objectie to minimize tensor -R self.reward = r # scalar self.buildTrain(1e-4) self.buildSummaries() def buildTrain(self,learning_rate): with tf.variable_scope("Optimizer"): optimizer=tf.train.AdamOptimizer(learning_rate, beta1=0.1, beta2=0.1) # beta2=0.1 # maximize reward self.train_op=optimizer.minimize(-self.reward, var_list=self.policy_vars) def buildSummaries(self): tf.scalar_summary("R", self.reward) self.all_summaries = tf.merge_all_summaries() def update(self,sess,feed_dict, write_summary=False): fetches=[self.reward,self.train_op] if write_summary: fetches.append(self.all_summaries) return sess.run(fetches,feed_dict) Policy.register(PlanePolicy)
def policies(self):
    from policy import Policy
    return Policy.query(ancestor=self.key).fetch()
#self.u=tf.tanh(sampleNormal(mu,tf.exp(logsigma)),name="u") self.policy_vars = [v for v in tf.all_variables() if v.name.startswith(vs.name)] print([v.name for v in self.policy_vars]) def eval(self,sess,x): return sess.run(self.u,{self.x:x}) def set_reward(self,r): # set objectie to minimize tensor -R self.reward = r # scalar self.buildTrain(1e-4) self.buildSummaries() def buildTrain(self,learning_rate): with tf.variable_scope("Optimizer"): optimizer=tf.train.AdamOptimizer(learning_rate, beta1=0.1, beta2=0.1) # beta2=0.1 # maximize reward self.train_op=optimizer.minimize(-self.reward, var_list=self.policy_vars) def buildSummaries(self): tf.scalar_summary("R", self.reward) self.all_summaries = tf.merge_all_summaries() def update(self,sess,feed_dict, write_summary=False): fetches=[self.reward,self.train_op] if write_summary: fetches.append(self.all_summaries) return sess.run(fetches,feed_dict) Policy.register(VisuomotorPolicy)
#!/bin/env python

from policy import Policy
import numpy as np


class RandomPolicy(Policy):
    '''
    random policy - for benchmarking E2C model on a 'static' dataset
    '''

    def __init__(self, batch_size, x_dim, u_dim):
        super(RandomPolicy, self).__init__(batch_size, x_dim, u_dim)

    def eval(self, sess, x):
        return np.random.uniform(low=-1., high=1., size=[x.shape[0], self.u_dim])  # np.random.randn(self.u_dim)


Policy.register(RandomPolicy)
def eval(self,sess,x): # ergodicity: if np.random.rand() < .1: return np.random.uniform(low=-1.,high=1.,size=self.u_dim) else: return sess.run(self.u,{self.x:x}) def set_reward(self,r): # set objectie to minimize tensor -R self.reward = r # scalar self.buildTrain(1e-4) self.buildSummaries() def buildTrain(self,learning_rate): with tf.variable_scope("Optimizer"): optimizer=tf.train.AdamOptimizer(learning_rate, beta1=0.1, beta2=0.1) # beta2=0.1 # maximize reward self.train_op=optimizer.minimize(-self.reward, var_list=self.policy_vars) def buildSummaries(self): tf.scalar_summary("R", self.reward) self.all_summaries = tf.merge_all_summaries() def update(self,sess,feed_dict, write_summary=False): fetches=[self.reward,self.train_op] if write_summary: fetches.append(self.all_summaries) return sess.run(fetches,feed_dict) Policy.register(SimplePolicy)
class Obj: """ 群体: """ def __init__(self, name, init_x, init_y, ch, color): self.name = name _fn = "dot%s" % self.name self.backup = basic.BackUp(_fn) self.chr = ch self.color = color self.V = (0., 0., 0., 0.,) """群体的个人集合 """ self.P = [] """性别情况 """ self.male = 0 self.female = 0 """怀孕人数 """ self.mating = 0 self.policy = Policy() self.X = init_x + self.policy._func() self.Y = init_y + self.policy._func() self.load() def move(self): self.V = self.policy.getPolicy() self.X = self.X + (self.V[0]-self.V[2]) self.Y = self.Y + (self.V[1]-self.V[3]) def getPosition(self): return int(self.X), int(self.Y), self.chr, self.color def save(self): _data = { "x": self.X, "y": self.Y, "ch": self.chr, "c": self.color, "persons": [] } for _p in self.P: _data['persons'].append(_p.show_name()) _p.save() self.backup.save(_data) def load(self): _info = self.backup.load() if _info is not None: self.X = _info["x"] self.Y = _info["y"] self.chr = _info["ch"] self.color = _info["c"] for _name in _info["persons"]: _p = resource.Ps(_name) if _p.show_sex() == 'Male': self.male += 1 else: self.female += 1 self.P.append(_p) else: """初值:每个群体100人 """ for _i in range(100): """个体姓名 """ _name = "%s-%s" % (self.name, uuid.uuid4()) _p = resource.Ps(_name) if _p.show_sex() == 'Male': self.male += 1 else: self.female += 1 self.P.append(_p) def time_scale(self, ts): """ 时标处理器 :param ts: 时标 :return: 存活个数,需求总量,男性个数,女性个数,怀孕人数 """ _alive = 0 _requirment = 0 _male = [] _female = [] self.mating = 0 for _p in self.P: if ts % 24 == 0: _requirment += _p.life_one_day() """收集满足交配条件的个人 """ if _p.can_mating("Male"): _male.append(_p) if _p.can_mating("Female"): _female.append(_p) else: _requirment += _p.show() if _p.alive(): _alive += 1 if len(_male) > 0 and len(_female) > 0: """若有满足交配条件的两性,则允许交配 """ for _m in _male: _m.mating('Male') for _f in _female: if _f.mating('Female'): self.mating += 0 for _p in self.P: if _p.is_mating(): self.mating += 1 if _p.birth(): """新生一代 """ _name = "%s-%s" % (self.name, uuid.uuid4()) _p = resource.Ps(_name) if _p.show_sex() == 'Male': self.male += 1 else: self.female += 1 self.P.append(_p) return _alive, _requirment, self.male, self.female, self.mating
def __init__(self, policy, dealer):
    super(BotPlayer, self).__init__()
    self.policy = Policy(policy)
    self.dealer = dealer
import time
import unittest
from itertools import izip

import redis
import yaml

# Assumes the project-local Features, ThemisMetaData, Policy and PolicyData
# classes are importable here.


class FeaturesTestCase(unittest.TestCase):
    """ Tests for the Features class """

    FEATURES_CUSTOM_CALLBACK = {
        'statisticsFeature': bool,
        'evaluateByRecipientFeature': bool,
        'policiesByServerPoolFeature': bool,
        'messagesBySecFeature': bool,
        'messagesBySecStoreDays': int,
        'feederFeaturesEnabled': bool,
        'featuresByServerPool': bool,
        'learnFeature': bool,
        'learnPredictSafeValue': float,
        'learnOnlyOnce': bool,
        'learnEscalationValue': float,
        'learnBlockMinutes': float,
        'rateReputationFeature': bool,
        'rateReputationBlockHitValue': int,
        'rateReputationDecreaseValue': float,
        'rateReputationIncreaseMinutes': float,
        'ipReputationFeature': bool,
        'ipReputationHitValue': int,
        'ipReputationDecreaseValue': float,
        'ipReputationIncreaseMinutes': float,
        'subjectReputationFeature': bool,
        'subjectReputationHitValue': int,
        'subjectReputationDecreaseValue': float,
        'subjectReputationIncreaseMinutes': float,
        'global_custom_block': float,
    }

    DEFAULT_POLICY_PARAMS = {
        'enable': 'TRUE',
        'type': 'regular',
        'priority': 5,
        'jailby': 'Sender:user@domain+',
        'jailheader': 'X-Themis-Quarantine',
        'jailaction': 'monitor',
        'replydata': 'Limit reached. Blocking for %s second(s)',
        'countrcpt': 'FALSE',
        'stophere': 'FALSE',
        'requestsmon': 'FALSE',
        'subjectprobation': 0.5,
        'ipprobation': 0.5,
        'blockprobation': 0.5,
        'countsentprobation': 1
    }

    METADATA_ITENS = {
        'learnFeature': True,
        'learnPredictSafeValue': 10,
        'learnEscalationValue': 1.0,
        'learningBlueMode': True,
        'learningRedMode': False,
        'blue_creation_date': time.time(),
        'last_update': 0,
        'predictBy': 'BLUE',
        'ip_reputation_lastupdate': 0,
        'subject_lastupdate': 0,
        'last_subject': 0,
        'sentmessages_lastupdate': 0,
        'manual_block': False,
        'bypass': True,
        'subject_repeated_count': 0,
        'block_count': 0
    }

    def pairs_to_dict_typed(self, response, type_info):
        it = iter(response)
        result = {}
        for key, value in izip(it, it):
            if key in type_info:
                try:
                    value = type_info[key](value)
                except:
                    # if for some reason the value can't be coerced, just use
                    # the string value
                    pass
            result[key] = value
        return result

    def hgetall_custom_callback(self, response):
        data = dict(ThemisMetaData.METADATA_CUSTOM_CALLBACK.items() + self.FEATURES_CUSTOM_CALLBACK.items())
        return response and self.pairs_to_dict_typed(response, data) or {}

    def init(self):
        with open('../config/config.yaml') as f:
            _, config_features, _ = yaml.load_all(f)
        self.features = config_features
        self.redis = redis.StrictRedis('localhost')
        #self.redis.flushdb()
        self.redis.set_response_callback('HGETALL', self.hgetall_custom_callback)
        self.policy = Policy(self.redis)
        self.add_default_policy()

    def add_default_policy(self):
        default_policy = {'Source': 'any', 'Destination': 'any', 'JailSpec': '1:1000'}
        for default_key, value in self.DEFAULT_POLICY_PARAMS.items():
            if default_key not in default_policy.keys():
                default_policy[default_key] = value
        default_policy['policy_name'] = 'default'
        pdata = PolicyData(**default_policy)
        try:
            self.policy.delete('default')
        except ValueError:
            pass
        self.policy.setpolicy(pdata)

    def test_store_global_features(self):
        """
        This method tests a config file with several key features. It must be
        inserted into redis, then fetched back with the proper types.
        """
        self.init()
        # convert global config features to a Features object
        global_features = Features(**self.features)
        # store global config features in redis
        self.redis.hmset('config:themis:features', global_features.as_redis)
        # fetch the global features back from redis
        global_features = Features(**self.redis.hgetall('config:themis:features'))
        # if we got this far it MUST be TRUE, the validations are made in the Features object
        self.assertTrue(True)

    def test_themis_metadata_override(self):
        """ This method checks if the metadata overrides the global features. """
        self.init()
        tmetadata = ThemisMetaData(**self.METADATA_ITENS)
        # Custom features for ThemisMetaData
        tmetadata.messagesBySecStoreDays = 500
        tmetadata.learnTimeFrameValue = '104:1000'
        tmetadata.ipReputationFeature = True
        tmetadata.learnOnlyOnce = False
        # Include all the global features in the ThemisMetaData, this will not override what is already set
        tmetadata.update_features(**Features(**self.features).as_dict)
        assertResult = False
        # Custom features MUST NOT be overridden
        for key, value in tmetadata.__dict__.items():
            if key == 'messagesBySecStoreDays' and value == 500:
                assertResult = True
            elif key == 'learnTimeFrameValue' and value == '104:1000':
                assertResult = True
            elif key == 'ipReputationFeature' and value is True:
                assertResult = True
            elif key == 'learnOnlyOnce' and value is False:
                assertResult = True
        self.assertTrue(assertResult)

    def test_subject_feature_count(self):
        """
        This method validates that the subject feature is counted properly by
        generating 3 requests with the same subject, then changing the subject
        to validate that the count is correct.
        """
        self.init()
        self.redis.delete('AI:metadata:[email protected]')
        for index in range(0, 5):
            metadata = self.redis.hgetall('AI:metadata:[email protected]')
            feat = Features(**self.features)
            if not metadata:
                # Here we feed redis for the first time with metadata. Will be executed only once
                tmetadata = ThemisMetaData(**self.METADATA_ITENS)
                self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis)
                # We update
                tmetadata.update_features(**feat.as_dict)
            else:
                tmetadata = ThemisMetaData(**metadata)
            policy = Policy(self.redis)
            pdata = policy.getpolicy('default')
            milter_object = '*****@*****.**'
            from_ipaddress = '189.10.21.1'
            if index == 4:
                milter_subject = 'Must NOT Update Subject Count, because subject title is different at last loop'
            else:
                milter_subject = 'Repeated Subject until index is 3'
            # Enable feeder feature
            feat.feederFeaturesEnabled = True
            feat.tmetadata = tmetadata
            feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject)
        tmetadata = ThemisMetaData(**self.redis.hgetall('AI:metadata:[email protected]'))
        # Must be empty because at index 4 the subject is changed, so the namespace is deleted by this object
        score_milter_object = self.redis.zrange('AI:subjectReputationFeature:groupby', 0, -1, withscores=True)
        # Should assert True with 3 because at index 4 the subject name changes
        self.assertTrue(tmetadata.subject_repeated_count == 3 and not score_milter_object)

    # NOTE: It is not necessary to test the behavior of all the other features
    # because they share the same implementation
    def test_subject_feature_reset_time(self):
        """ This method validates that the subject feature is reset properly. """
        self.init()
        self.redis.delete('AI:metadata:[email protected]')
        for index in range(0, 3):
            metadata = self.redis.hgetall('AI:metadata:[email protected]')
            feat = Features(**self.features)
            if not metadata:
                # Here we feed redis for the first time with metadata. Will be executed only once
                tmetadata = ThemisMetaData(**self.METADATA_ITENS)
                self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis)
                # We update
                tmetadata.update_features(**feat.as_dict)
            else:
                tmetadata = ThemisMetaData(**metadata)
            policy = Policy(self.redis)
            pdata = policy.getpolicy('default')
            milter_object = '*****@*****.**'
            from_ipaddress = '189.10.21.1'
            milter_subject = 'Repeated Subject'
            # Enable feeder feature
            feat.feederFeaturesEnabled = True
            # 0.5 second expressed in hours
            pdata.subjectprobation = 0.000138888889
            feat.tmetadata = tmetadata
            feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject)
            tmetadata = ThemisMetaData(**self.redis.hgetall('AI:metadata:[email protected]'))
            #print tmetadata.subject_repeated_count
            # If the sleep is 0.1 the subject_repeated_count will be two (2)
            time.sleep(0.5)
        # Must return 1 because the subject is repeated and always reset to 1
        score_milter_object = self.redis.zrange('AI:subjectReputationFeature:groupby', 0, -1, withscores=True)[0][1]
        self.assertTrue(tmetadata.subject_repeated_count == 1 and score_milter_object == 1)

    def test_ip_reputation_feature(self):
        """
        This method validates that the ips are updated correctly, and also
        that the scores have the proper values.
        """
        self.init()
        self.redis.delete('AI:metadata:[email protected]')
        for from_ipaddress in ['189.20.3.1', '200.20.3.100', '200.20.3.100', '201.20.230.10']:
            metadata = self.redis.hgetall('AI:metadata:[email protected]')
            feat = Features(**self.features)
            if not metadata:
                # Here we feed redis for the first time with metadata. Will be executed only once
                tmetadata = ThemisMetaData(**self.METADATA_ITENS)
                self.redis.hmset('AI:metadata:[email protected]', tmetadata.as_redis)
                # We update
                tmetadata.update_features(**feat.as_dict)
            else:
                tmetadata = ThemisMetaData(**metadata)
            policy = Policy(self.redis)
            pdata = policy.getpolicy('default')
            milter_object = '*****@*****.**'
            milter_subject = 'Subject does matter'
            # Enable feeder feature
            feat.feederFeaturesEnabled = True
            feat.tmetadata = tmetadata
            feat.call_feeders(self.redis, pdata, milter_object, from_ipaddress, milter_subject)
        score_object = self.redis.zrange(feat.ipReputationFeatureGroupByNamespace, 0, -1, withscores=True)[0][1]
        self.assertTrue(len(self.redis.smembers(feat.ipReputationFeatureNamespace)) == 3 and score_object == 3)
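# Not part of the original snippet: a minimal entry point so the test case above
# can be run directly. It assumes a local redis instance and the
# ../config/config.yaml file referenced by init() are available.
if __name__ == '__main__':
    unittest.main()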
# Assumes the project-local Policy and Status definitions are importable here.
class TaskQueueManager():

    def __init__(self, policy, number_of_tasks=None):
        self.__policy_list = Policy()
        self.__policy = policy
        self.__by_priority = any(self.__policy == pol for pol in self.__policy_list.uses_priority())
        self.__setted_sort = self.__by_priority or policy == self.__policy_list.sjf
        self.__all = []
        self.__ready = []
        self.__finished = 0
        self.__number_of_tasks = number_of_tasks

    def new_task(self, task):
        if isinstance(task, list):
            self.__all += task
        else:
            self.__all.append(task)

    def put_on_ready(self, task):
        index = self.__all.index(task)
        self.__all[index].status = Status.waiting
        self.__ready.append(index)
        # Sort the ready list according to the scheduling policy
        self.__sort()

    def put_on_finished(self, task):
        index = self.__all.index(task)
        self.__all[index].status = Status.finished
        self.__finished += 1

    def placed_on_processor(self, task):
        index = self.__all.index(task)
        self.__all[index].status = Status.running
        try:
            self.__ready.remove(index)
        except ValueError:
            pass

    def get_next_task(self, amount=1):
        task_to_run = []
        for i in range(0, amount):
            if self.__ready:
                index = self.__ready.pop(0)
                self.__all[index].status = Status.running
                task_to_run.append(self.__all[index])
        return task_to_run

    def there_task(self):
        if self.__number_of_tasks is None:
            return True
        else:
            return self.__finished < self.__number_of_tasks

    def increase_wait_time(self):
        if self.__policy == self.__policy_list.rrta:
            for i in self.__ready:
                self.__all[i].wait_time += 1
                self.__all[i].priority += 1  # task aging
        else:
            for i in self.__ready:
                self.__all[i].wait_time += 1

    def get_task_list(self):
        return self.__all

    def get_ready_list(self):
        ready_list = []
        for index in self.__ready:
            ready_list.append(self.__all[index])
        return ready_list

    def __sort(self):
        'Sort the ready list according to the scheduling policy'
        if self.__setted_sort:
            if self.__by_priority:
                self.__ready.sort(key=self.__sort_by_priority(self.__all), reverse=True)
            else:
                self.__ready.sort(key=self.__sort_by_duration(self.__all))

    def __sort_by_priority(self, task_list):
        'Used to sort "self.__ready" by priority of the items of "self.__all"'
        class K(object):
            def __init__(self, obj, *args):
                self.obj = obj
            def __lt__(self, other):
                return task_list[self.obj].priority < task_list[other.obj].priority
            def __gt__(self, other):
                return task_list[self.obj].priority > task_list[other.obj].priority
        return K

    def __sort_by_duration(self, task_list):
        'Used to sort "self.__ready" by time_required of the items of "self.__all"'
        class K(object):
            def __init__(self, obj, *args):
                self.obj = obj
            def __lt__(self, other):
                return task_list[self.obj].time_required < task_list[other.obj].time_required
            def __gt__(self, other):
                return task_list[self.obj].time_required > task_list[other.obj].time_required
        return K
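# Minimal usage sketch (not part of the original source). It assumes the same
# module also provides Policy, Status, and a Task type with `status`, `priority`,
# `time_required`, and `wait_time` attributes; `Task` and its constructor are
# hypothetical, while `Policy().sjf` and `uses_priority()` come from the class above.
policies = Policy()
manager = TaskQueueManager(policies.sjf, number_of_tasks=3)
manager.new_task([Task(time_required=5), Task(time_required=2), Task(time_required=9)])
for task in manager.get_task_list():
    manager.put_on_ready(task)              # ready queue is kept sorted by time_required (SJF)
while manager.there_task():
    for task in manager.get_next_task(amount=1):
        manager.put_on_finished(task)       # shortest job finishes first
    manager.increase_wait_time()            # remaining ready tasks accumulate wait time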
# Assumes the project-local Graph and Policy classes are importable here.
class ReBAC(object):

    def __init__(self, relationship_graph_dict=None):
        """ Initializes the relationship graph """
        # avoid a shared mutable default argument
        if relationship_graph_dict is None:
            relationship_graph_dict = {}
        #print relationship_graph_dict
        self.relationship_graph = Graph(relationship_graph_dict)
        self.policy = Policy()

    def user_list(self):
        """ Returns the users of the relationship graph """
        return self.relationship_graph.list_nodes()

    def relationships(self):
        """ Returns the relationships among the users in the relationship graph """
        return self.relationship_graph.list_edges()

    def create_user(self, username):
        """ If "username" does not exist in the relationship graph, add it as a
        new node; otherwise nothing has to be done """
        if self.relationship_graph.create_node(username):
            print(username + " is created successfully")
        else:
            print(username + " already exists")

    def compute_all_paths(self, source_user, target_user, paths=None):
        """ Returns all paths between two existing users """
        # avoid a shared mutable default argument
        if paths is None:
            paths = []
        return self.relationship_graph.find_all_paths(source_user, target_user, paths)

    def add_relationship(self, username1, username2):
        """ Add a relationship between two existing users; if either user does
        not exist, report an error and return False """
        if self.check_user_existence(username1) and self.check_user_existence(username2):
            if self.check_policy('add_relationship', username1, username2):
                if self.relationship_graph.add_edge(username1, username2):
                    print("relationship successfully created between " + username1 + " and " + username2)
                    return True
            else:
                print("Policy doesn't authorize creating a relationship between " + username1 + " and " + username2)
                return False
        else:
            return False

    def check_policy(self, action_type, source_user, target_user):
        """ Check policies for the different actions """
        paths = self.compute_all_paths(source_user, target_user)
        return self.policy.check_policy(action_type, source_user, target_user, paths)

    def delete_relationship(self, username1, username2):
        """ Delete the relationship between two existing users; if either or
        both users do not exist, report an error and return False """
        if self.check_user_existence(username1) and self.check_user_existence(username2):
            if self.relationship_graph.check_edge(username1, username2):
                if self.check_policy("delete_relationship", username1, username2):
                    if self.relationship_graph.delete_edge(username1, username2):
                        print("Relationship successfully deleted between " + username1 + " and " + username2)
                        return True
                else:
                    print(username1 + " is not authorized to delete relationship with " + username2)
                    return False
            else:
                print(username1 + " doesn't have any relationship with " + username2)
                return False
        else:
            return False

    def access(self, username1, username2):
        """ Check whether a user is allowed to access another user """
        if self.check_user_existence(username1) and self.check_user_existence(username2):
            if self.check_policy("access", username1, username2):
                print(username1 + " is allowed to access " + username2)
                return True
            else:
                print(username1 + " is not authorized to access " + username2)
                return False
        else:
            return False

    def check_user_existence(self, username):
        """ Check the existence of a user """
        if self.relationship_graph.check_node_existence(username):
            return True
        else:
            print("User doesn't exist")
            return False
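# Minimal usage sketch (not part of the original source). It assumes the Graph
# and Policy classes used by ReBAC are importable from the same module and that
# the configured policy authorizes 'add_relationship' and 'access' between the
# users involved.
rebac = ReBAC()
rebac.create_user("alice")
rebac.create_user("bob")
rebac.add_relationship("alice", "bob")   # checked against the 'add_relationship' policy
rebac.access("alice", "bob")             # prints the decision and returns True/False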