class MarkovModel(dict):
    """ Dictionary-based nth-order Markov model """

    def __init__(self, corpus=[], order=1):
        self.memory = Memory(order)
        # Appending the first `order` states lets the model loop back to the
        # beginning once the end of the corpus is reached when sampling.
        for message in corpus + corpus[:order]:
            self.add_state(message)
        self.memory.clear()

    def add_state(self, new_state):
        """ Add a state to the Markov model and enqueue it into memory """
        current_state = self.memory.serialize()
        if current_state in self:
            self[current_state].append(new_state)
        else:
            self[current_state] = [new_state]
        self.memory.enqueue(new_state)

    def sample(self, N=1, starting_state=tuple()):
        """ Return a generator that samples N times from the Markov model """
        for state in starting_state:
            self.memory.enqueue(state)
        for _ in range(N):
            next_state = choice(self[starting_state])
            self.memory.enqueue(next_state)
            yield next_state
            starting_state = self.memory.serialize()
        self.memory.clear()
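MarkovModel leans on a Memory helper and a choice function that are not shown here. Below is a minimal sketch of what they might look like, plus a call site, assuming Memory(order) is simply a fixed-size window over the last `order` states; the deque-based Memory and the example corpus are illustrative, not the original project's implementation.

from collections import deque
from random import choice


class Memory:
    """Hypothetical fixed-size window of the last `order` states."""

    def __init__(self, order):
        self.states = deque(maxlen=order)

    def enqueue(self, state):
        self.states.append(state)

    def serialize(self):
        # MarkovModel keys are tuples of the last `order` states.
        return tuple(self.states)

    def clear(self):
        self.states.clear()


# Example: a first-order model over words, sampled from a known one-word state.
words = "the cat sat on the mat".split()
model = MarkovModel(corpus=words, order=1)
print(list(model.sample(N=5, starting_state=("the",))))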
class SpatialMemoryMachine(object):

    def __init__(self, dmemory, daddress, nstates, dinput, doutput,
                 init_units, create_memories, influence_threshold, sigma):
        self.memory = Memory(dmemory, daddress, init_units, create_memories,
                             influence_threshold, sigma)
        self.controller = Controller(dmemory, daddress, nstates, dinput, doutput)
        self.doutput = doutput
        self.read0 = np.random.randn(dmemory)

    def __call__(self, inputs):
        sequence_length = inputs.shape[0]
        self.read = self.read0
        outputs = []
        for t in range(sequence_length):
            address_r, address_w, erase, add, output = self.controller(
                inputs[t], self.read)
            # print(address_r, address_w, erase, add, output)
            self.memory.commit(address_w, erase, add)
            self.read = self.memory.fetch(address_r)
            outputs.append(output.reshape(1, -1))
        return np.concatenate(outputs, axis=0)

    def loss(self, inputs, targets):
        inputs, targets = map(np.array, [inputs, targets])
        outputs = self(inputs)
        ep = 2e-23
        loss = -np.sum(targets * np.log2(outputs + ep)
                       + (1 - targets) * np.log2(1 - outputs + ep))
        return loss

    def clear(self):
        self.read = self.read0
        self.memory.clear()
        self.controller.clear()

    def get_params(self):
        params = self.controller.get_params()
        params['read0'] = self.read0
        return params

    def set_params(self, params):
        self.read0 = params['read0']
        self.controller.set_params(params)
def ppo_train(model_name, load_model=False, actor_filename=None,
              critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()

    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after " + str(episode) + " episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
class DCPU(object):

    HEX_OUTPUT_FORMAT = "%#06x"
    MAX_VAL = bitmask(specs.WORD_SIZE)

    def __init__(self):
        self.cycles_ran = 0
        self.registers = Memory(specs.WORD_SIZE)
        self.reset_registers()
        self.RAM = RAM(specs.WORD_SIZE, specs.MAX_RAM_ADDRESS)

        self.basic_ops = {
            specs.BasicOperations.SET: self.set,
            specs.BasicOperations.ADD: self.add,
            specs.BasicOperations.SUB: self.subtract,
            specs.BasicOperations.MUL: self.multiply,
            specs.BasicOperations.DIV: self.divide,
            specs.BasicOperations.MOD: self.modulo,
            specs.BasicOperations.SHL: self.shift_left,
            specs.BasicOperations.SHR: self.shift_right,
            specs.BasicOperations.AND: lambda a, b: self.boolean_operation(operator.and_, a, b),
            specs.BasicOperations.BOR: lambda a, b: self.boolean_operation(operator.or_, a, b),
            specs.BasicOperations.XOR: lambda a, b: self.boolean_operation(operator.xor, a, b),
            specs.BasicOperations.IFE: lambda a, b: self.if_condition(operator.eq, a, b),
            specs.BasicOperations.IFN: lambda a, b: self.if_condition(operator.ne, a, b),
            specs.BasicOperations.IFG: lambda a, b: self.if_condition(operator.gt, a, b),
            specs.BasicOperations.IFB: lambda a, b: self.if_condition(operator.and_, a, b),
        }

        self.non_basic_ops = {
            specs.NonBasicOperations.JSR: self.jump_and_set_return,
        }

    # Note: this decorator is only referenced at class-definition time.
    def cycles(num_cycles):
        ''' Decorator used to specify the number of cycles taken by a function '''
        def cycle_decorator(fn):
            def wrapper(self, *args, **kwargs):
                self.cycles_ran += num_cycles
                return fn(self, *args, **kwargs)
            return wrapper
        return cycle_decorator

    ''' Helper properties to access the special register values: SP, PC, O '''

    @property
    def PC(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['PC']]

    @PC.setter
    def PC(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['PC']] = value

    @property
    def SP(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['SP']]

    @SP.setter
    def SP(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['SP']] = value

    @property
    def O(self):
        return self.registers[specs.SPECIAL_REGISTER_NAMES['O']]

    @O.setter
    def O(self, value):
        self.registers[specs.SPECIAL_REGISTER_NAMES['O']] = value

    def reset_registers(self):
        ''' Set all registers to default values '''
        self.registers.clear()
        self.SP = specs.MAX_RAM_ADDRESS

    def reset(self):
        ''' Set the CPU to a clean state '''
        self.cycles_ran = 0
        self.reset_registers()
        self.RAM.clear()

    def load_program(self, program):
        ''' Load the given instructions into RAM sequentially '''
        self.reset()
        for i in range(len(program)):
            self.RAM[i] = read_instruction(program[i])

    def run_program(self, program):
        ''' Run the given program and detect any infinite loops '''
        self.load_program(program)
        visited_states = set()
        while self.execute_next_instruction():
            state = "\n".join(self.get_state(show_cycles=False))
            if state in visited_states:
                raise InfiniteLoopDetected()
            else:
                visited_states.add(state)

    def execute_next_instruction(self):
        '''
        Main execution function: grabs the next instruction and executes it.
        Stops if it ever reads STOP_INSTRUCTION.
        '''
        next_instruction = self.get_next_word()
        if next_instruction == specs.STOP_INSTRUCTION:
            return False
        (op_code, a, b) = parse_instruction(next_instruction)
        is_basic = (b is not None)
        op = self.get_op(op_code, is_basic)
        if is_basic:
            op(self.get_value(a), self.get_value(b))
        else:
            op(self.get_value(a))
        return True

    @cycles(1)
    def get_next_word(self):
        ''' Retrieve the word pointed to by PC and increment PC by 1 '''
        next_word = self.RAM[self.PC]
        self.PC += 1
        return next_word

    def get_op(self, op_code, is_basic):
        ''' Return a function implementing the given op_code '''
        ops = self.basic_ops if is_basic else self.non_basic_ops
        if op_code not in ops:
            raise OpCodeNotImplemented(op_code)
        return ops[op_code]

    def get_value(self, value_code):
        '''
        Return a Value instance with appropriate read/write functionality
        based on the given value_code
        '''
        # These value codes read/write to a register
        if value_code in specs.REGISTERS or value_code in specs.SPECIAL_REGISTERS:
            def read():
                return self.registers[value_code]

            def write(v):
                self.registers[value_code] = v
        # These value codes read/write to an address in RAM
        elif value_code <= 0x1e:
            # [register]
            if value_code <= 0x0f:
                address = self.registers[value_code - 0x08]
            # [next word + register]
            elif value_code <= 0x17:
                address = self.registers[value_code - 0x10] + self.get_next_word()
            # POP
            elif value_code == 0x18:
                address = self.SP
                self.SP += 1
            # PEEK
            elif value_code == 0x19:
                address = self.SP
            # PUSH
            elif value_code == 0x1a:
                self.SP -= 1
                address = self.SP
            # [next word]
            elif value_code == 0x1e:
                address = self.get_next_word()

            def read():
                return self.RAM[address]

            def write(v):
                self.RAM[address] = v
        # These value codes represent literals
        elif value_code <= 0x3f:
            # next word (literal)
            if value_code == 0x1f:
                value = self.get_next_word()
            # literal value 0x00-0x1f
            else:
                value = (value_code - 0x20)

            def read():
                return value

            def write(v):
                # Fail silently on trying to assign to a literal
                pass
        # Value codes > 0x3f are undefined
        else:
            raise InvalidValueCode(value_code)

        return Value(read, write)

    @cycles(1)
    def set(self, a, b):
        ''' Sets a to b '''
        a.write(b.read())

    @cycles(2)
    def add(self, a, b):
        ''' Sets a to a+b; sets O to 0x0001 if there's an overflow, 0x0 otherwise '''
        result = a.read() + b.read()
        if result > self.MAX_VAL:
            self.O = 0x0001
        else:
            self.O = 0x0
        a.write(result)

    @cycles(2)
    def subtract(self, a, b):
        ''' Sets a to a-b; sets O to 0xffff if there's an underflow, 0x0 otherwise '''
        a_value = a.read()
        b_value = b.read()
        if b_value > a_value:
            a_value += self.MAX_VAL
            self.O = 0xffff
        else:
            self.O = 0x0
        result = a_value - b_value
        a.write(result)

    @cycles(2)
    def multiply(self, a, b):
        ''' Sets a to a*b, sets O to ((a*b)>>16)&0xffff '''
        result = a.read() * b.read()
        self.O = (result >> specs.WORD_SIZE)
        a.write(result)

    @cycles(3)
    def divide(self, a, b):
        '''
        Sets a to a/b, sets O to ((a<<16)/b)&0xffff.
        If b==0, sets a and O to 0 instead.
        '''
        a_value = a.read()
        b_value = b.read()
        if b_value == 0:
            self.O = 0
            a.write(0)
        else:
            self.O = ((a_value << specs.WORD_SIZE) // b_value)
            a.write(a_value // b_value)

    @cycles(3)
    def modulo(self, a, b):
        ''' Sets a to a%b. If b==0, sets a to 0 instead. '''
        b_value = b.read()
        if b_value == 0:
            a.write(0)
        else:
            a.write(a.read() % b_value)

    @cycles(2)
    def shift_left(self, a, b):
        ''' Sets a to a<<b, sets O to ((a<<b)>>16)&0xffff '''
        result = a.read() << b.read()
        a.write(result)
        self.O = (result >> 16)

    @cycles(2)
    def shift_right(self, a, b):
        ''' Sets a to a>>b, sets O to ((a<<16)>>b)&0xffff '''
        a_value = a.read()
        b_value = b.read()
        a.write(a_value >> b_value)
        self.O = ((a_value << 16) >> b_value)

    @cycles(1)
    def boolean_operation(self, boolean_operator, a, b):
        ''' Sets a to a <boolean_operator> b '''
        a.write(boolean_operator(a.read(), b.read()))

    @cycles(2)
    def if_condition(self, conditional, a, b):
        ''' Performs the next instruction only if a <conditional> b '''
        if not conditional(a.read(), b.read()):
            self.PC += get_word_length(self.RAM[self.PC])
            self.cycles_ran += 1

    @cycles(2)
    def jump_and_set_return(self, a):
        '''
        Pushes the address of the next instruction to the stack,
        then sets PC to a
        '''
        self.SP -= 1
        self.RAM[self.SP] = self.PC
        self.PC = a.read()

    def get_state(self, show_cycles=True):
        state = []
        if show_cycles:
            state.append("Ran %d cycles" % self.cycles_ran)
            state.append("")
        state.append("PC: " + self.HEX_OUTPUT_FORMAT % self.PC)
        state.append("SP: " + self.HEX_OUTPUT_FORMAT % self.SP)
        state.append("O: " + self.HEX_OUTPUT_FORMAT % self.O)
        state.append("")
        state.append("Register values")
        state.append("---------------")
        state.append('\n'.join([name + ": " + self.HEX_OUTPUT_FORMAT % self.registers[key]
                                for (key, name) in sorted(specs.REGISTERS.items())]))
        state.append("")
        state.append("Memory dump")
        state.append("-----------")
        state.append(str(self.RAM))
        return state

    def __str__(self):
        return '\n'.join(self.get_state())
class Agent:
    """ Class implementing the agent """

    def __init__(self, state_size, action_size, args):
        self.args = args
        with open(
                os.path.dirname(
                    os.path.abspath(inspect.getfile(inspect.currentframe())))
                + '/agent_args.json') as f:
            data = json.load(f)
        self.initial_epsilon = int(
            data[self.args.environment]["initial_epsilon"])
        self.final_epsilon = float(
            data[self.args.environment]["final_epsilon"])
        self.current_epsilon = self.initial_epsilon
        self.epsilon_decay = float(
            data[self.args.environment]["epsilon_decay"])
        self.gamma = float(data[self.args.environment]["gamma"])
        self.minibatch_size = int(
            data[self.args.environment]["minibatch_size"])
        self.learning_rate = float(
            data[self.args.environment]["learning_rate"])
        self.fraction_update = float(
            data[self.args.environment]["fraction_update"])
        self.loss = data[self.args.environment]["loss"]
        self.memory_type = self.args.memory
        self.memory_size = int(data[self.args.environment]["memory_size"])
        if self.memory_type == "basic":
            self.memory = deque(maxlen=self.memory_size)
        else:
            self.memory = Memory(self.memory_size)
        self.action_size = action_size
        self.state_size = state_size
        if self.args.mdl_blueprint and not self.args.dont_save:
            self.mdl_blueprint = True
        else:
            self.mdl_blueprint = False
        network = Network(state_size, action_size, self.learning_rate,
                          self.loss, [True, self.mdl_blueprint])
        self.net_units = None
        if data[self.args.environment]["net_units"] != "None":
            self.net_units = [
                int(i) for i in data[self.args.environment]["net_units"]
            ]
        self.model_type = self.args.network
        if self.model_type == "2layer_bsc_mdl":
            self.model_net = network.make_2layer_mdl(self.net_units)
            self.target_net = network.make_2layer_mdl(self.net_units)
        elif self.model_type == "2layer_duel_mdl":
            self.model_net = network.make_2layer_duel_mdl(self.net_units)
            self.target_net = network.make_2layer_duel_mdl(self.net_units)
        elif self.model_type == "bsc_img_mdl":
            self.model_net = network.make_bsc_img_mdl()
            self.target_net = network.make_bsc_img_mdl()
        elif self.model_type == "duel_img_model":
            self.model_net = network.make_duel_img_mdl()
            self.target_net = network.make_duel_img_mdl()
        elif self.model_type == "1layer_ram_mdl":
            self.model_net = network.make_1layer_mdl(self.net_units)
            self.target_net = network.make_1layer_mdl(self.net_units)
        self.update_target_net()
        self.algorithm = self.args.algorithm
        self.algorithms = {
            "DQN": self.train_dqn,
            "DQN+TN": self.train_target_dqn,
            "DDQN": self.train_ddqn,
        }

    def update_target_net(self):
        """ Update the target network """
        self.target_net.set_weights(self.model_net.get_weights())
        print("[Target network was updated.]")

    def update_target_net_partially(self):
        """ Update the target network by parts """
        weights_model = self.model_net.get_weights()
        weights_target = self.target_net.get_weights()
        for i in range(len(weights_target)):
            weights_target[i] = weights_model[i] * self.fraction_update \
                + weights_target[i] * (1 - self.fraction_update)
        self.target_net.set_weights(weights_target)
        print("[Target network was updated by parts.]")

    def get_error(self, state, action, reward, next_state, done):
        """ Return the difference between the Q-values from the primary and target network """
        q_value = self.model_net.predict(np.array([state]))
        ns_model_pred = self.model_net.predict(np.array([next_state]))
        ns_target_pred = self.target_net.predict(np.array([next_state]))
        obs_error = q_value[0][action]
        if done == 1:
            q_value[0][action] = reward
        else:
            q_value[0][action] = reward + self.gamma * ns_target_pred[0][
                np.argmax(ns_model_pred)]
        obs_error = abs(obs_error - q_value[0][action])
        return obs_error

    def remember(self, state, action, reward, next_state, done, rand_agent):
        """ Save an observation (experience) to the experience replay memory """
        if self.memory_type == "basic":
            self.memory.append((state, action, reward, next_state, done))
        else:
            if rand_agent:
                obs_error = abs(reward)
            else:
                obs_error = self.get_error(state, action, reward, next_state,
                                           done)
            self.memory.add_observation(
                (state, action, reward, next_state, done), obs_error)

    def clear_memory(self):
        """ Clear the replay memory """
        self.memory.clear()

    def decrease_epsilon(self):
        """ Decrease epsilon """
        if self.current_epsilon > self.final_epsilon:
            if (self.current_epsilon - self.epsilon_decay) > self.final_epsilon:
                self.current_epsilon = self.current_epsilon - self.epsilon_decay
            else:
                self.current_epsilon = self.final_epsilon

    def get_action(self, task, state, non_normalized_state, epsilon):
        """ Return the action to take """
        if not epsilon:
            q_value = self.model_net.predict(np.array([state]))
        else:
            if np.random.rand() <= self.current_epsilon:
                if task.name == "2048-v0":
                    possible_actions = possible_moves(non_normalized_state)
                    while True:
                        rand_action = np.random.randint(0, self.action_size,
                                                        size=1)[0]
                        if possible_actions[rand_action] == 1:
                            return rand_action
                else:
                    return np.random.randint(0, self.action_size, size=1)[0]
            else:
                q_value = self.model_net.predict(np.array([state]))
        if task.name == "2048-v0":
            possible_actions = possible_moves(non_normalized_state)
            while True:
                chosen_action = np.argmax(q_value)
                if possible_actions[chosen_action] == 1:
                    return chosen_action
                else:
                    q_value[0][chosen_action] = -100
        return np.argmax(q_value)

    def get_minibatch(self):
        """ Return a minibatch from the different memory types """
        if self.memory_type == "basic":
            minibatch = random.sample(list(self.memory), self.minibatch_size)
            state = np.array([i[0] for i in minibatch])
            action = [i[1] for i in minibatch]
            reward = [i[2] for i in minibatch]
            next_state = np.array([i[3] for i in minibatch])
            done = [i[4] for i in minibatch]
        else:
            minibatch = self.memory.sample(self.minibatch_size)
            state = np.array([i[1][0] for i in minibatch])
            action = [i[1][1] for i in minibatch]
            reward = [i[1][2] for i in minibatch]
            next_state = np.array([i[1][3] for i in minibatch])
            done = [i[1][4] for i in minibatch]
        # The raw minibatch is returned as well so that prioritized memory can
        # be updated with the new errors after training.
        return minibatch, state, action, reward, next_state, done

    def train(self):
        """ Train the agent with the selected algorithm """
        self.algorithms[self.algorithm]()

    def train_dqn(self):
        """ Train the agent using DQN """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        errors = np.zeros(self.minibatch_size)
        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))
            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5
        q_value = self.model_net.predict(np.array(state))
        ns_model_pred = self.model_net.predict(np.array(next_state))
        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]
            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_model_pred[i])
            errors[i] = abs(errors[i] - q_value[i][action[i]])
        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1
        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_target_dqn(self):
        """ Train the agent using DQN with a target network """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        errors = np.zeros(self.minibatch_size)
        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))
            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5
        q_value = self.model_net.predict(np.array(state))
        ns_target_pred = self.target_net.predict(np.array(next_state))
        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]
            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * np.max(
                    ns_target_pred[i])
            errors[i] = abs(errors[i] - q_value[i][action[i]])
        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1
        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def train_ddqn(self):
        """ Train the agent using DDQN """
        if self.memory_type == "basic":
            if len(self.memory) >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        else:
            if self.memory.length >= self.minibatch_size:
                minibatch, state, action, reward, next_state, done = self.get_minibatch()
            else:
                return
        errors = np.zeros(self.minibatch_size)
        possible_actions_curr = []
        if self.args.environment == "2048-v0":
            for i, item in enumerate(state):
                possible_actions_curr.append(possible_moves(item))
            state = state / 16384.0 - 0.5
            next_state = next_state / 16384.0 - 0.5
        q_value = self.model_net.predict(state)
        ns_model_pred = self.model_net.predict(next_state)
        ns_target_pred = self.target_net.predict(next_state)
        for i in range(0, self.minibatch_size):
            errors[i] = q_value[i][action[i]]
            if done[i] == 1:
                q_value[i][action[i]] = reward[i]
            else:
                q_value[i][action[i]] = reward[i] + self.gamma * \
                    ns_target_pred[i][np.argmax(ns_model_pred[i])]
            errors[i] = abs(errors[i] - q_value[i][action[i]])
        for i, item in enumerate(possible_actions_curr):
            for e, elem in enumerate(item):
                if elem == 0:
                    q_value[i][e] = -1
        self.model_net.fit(state, q_value, epochs=1, verbose=0)
        if self.memory_type == "dueling":
            self.memory.update_minibatch(minibatch, errors)

    def load_model_weights(self, name):
        """ Load weights into the primary neural network """
        self.model_net.load_weights(name)
        print("[Model has been loaded from \"{}\".]".format(name))

    def save_model_weights(self, name):
        """ Save the weights of the primary neural network """
        self.model_net.save_weights("./model-{}".format(name))
        print("[Model was saved to \"./model-{}\".]".format(name))

    def load_target_weights(self, name):
        """ Load weights into the target neural network """
        self.target_net.load_weights(name)
        print("[Target model has been loaded from \"{}\".]".format(name))

    def save_target_weights(self, name):
        """ Save the weights of the target neural network """
        self.target_net.save_weights("./target-{}".format(name))
        print("[Target model was saved to \"./target-{}\".]".format(name))
    for i in range(num_env):
        surprisals[i] = model.get_surprisal(sess, memory.states[i],
                                            memory.action_indexes[i],
                                            memory.tail_states[i])

    normalized_advantages, rewards = rp.calc_normalized_advantages_and_rewards(
        memory.predicted_rewards, tail_predicted_rewards, surprisals)
    memory.save_advs_and_rews(normalized_advantages, rewards)
    print("advantage mean std", np.mean(normalized_advantages), np.std(normalized_advantages))
    print("reward mean std", np.mean(rewards), np.std(rewards))

    # Save visualization metadata
    for x in range(num_env):
        env = driver.envs[x]
        if env.should_record():
            ds_di = model.get_dsurprisal_dinps(sess, memory.states[x],
                                               memory.action_indexes[x],
                                               [prev_rollout_lastframes[x]])
            da_di = model.get_daction_dinps(sess, memory.states[x],
                                            memory.action_indexes[x])
            env.save_surprisals_and_grads_metadata(surprisals[x], ds_di, da_di)
    prev_rollout_lastframes = frames
    # End visualization metadata

    for _ in range(epoch_per_rollout):
        random_indexes = np.arange(num_env)
        np.random.shuffle(random_indexes)
        for start in range(0, num_env, mini_batch_size):
            end = start + mini_batch_size
            # Slicing past the end of the array is safe: array[start:end] simply truncates.
            _states, _action_indexes, _action_probabilities, _next_states, _advantages, _rewards = \
                memory.get_by_indexes(random_indexes[start:end])
            summary = model.train(sess, _states, _action_indexes, _action_probabilities,
                                  _next_states, _advantages, _rewards)
            summary_recorder.add_summary(summary, epoch_no)
        epoch_no += 1
        print(epoch_no, " finished")

    print("visited levels:", all_visited_levels)
    memory.clear()
    sys.stdout.flush()
class A2C(Agent):
    '''
    This class defines an Agent which uses an advantage actor-critic network.

    Params of __init__:
      - env: Environment -- environment to use;
      - gamma: float -- discount factor;
      - lambd: float -- advantage smoothing parameter;
      - learning_rate: float -- learning rate;
      - num_units: int -- number of units per layer;
      - num_layers: int -- number of layers;
      - update_frequency: int -- number of episodes per update.
    '''

    def __init__(self, env, gamma=0.99, lambd=0.7, learning_rate=0.1,
                 num_units=1, num_layers=0, update_frequency=5):
        super(A2C, self).__init__(env)
        self.gamma = gamma
        self.lambd = lambd
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.num_units = num_units
        self.num_layers = num_layers
        self.memory = Memory()
        tf.reset_default_graph()
        self.build()
        self.sess = tf.Session(config=get_tf_config())
        self.sess.run(self.init)

    def build(self):
        '''
        This function builds the TF graph and all the ops belonging to it.
        As a result new members are acquired:
          - self._state: state placeholder
          - self._action: action placeholder
          - self._advantage: advantage placeholder
          - self._value_target: value target placeholder
          - self.loss: loss tensor (policy loss + value loss)
          - self.update: train_op -- updates the actor-critic network
          - self.init: all variables initializer
        '''
        def num_or_shape(space):
            return space.n if isinstance(space, spaces.Discrete) else space.shape

        state_num_or_shape = num_or_shape(self.env.observation_space)
        action_num_or_shape = num_or_shape(self.env.action_space)
        self.actor_critic = FeedForwardActorCritic(self.num_layers,
                                                   self.num_units,
                                                   state_num_or_shape,
                                                   action_num_or_shape)
        self._state = self.actor_critic.state
        self._action = self.actor_critic.action
        self._advantage = tf.placeholder(shape=[None], dtype=tf.float32)
        self._value_target = tf.placeholder(shape=[None], dtype=tf.float32)

        self.policy_loss = -tf.reduce_mean(
            self._advantage * self.actor_critic.log_probability)
        # self.policy_loss = -tf.reduce_mean(
        #     (self._advantage - self.actor_critic.value_pred)
        #     * self.actor_critic.log_probability)
        self.value_loss = 0.5 * tf.reduce_mean(
            tf.squared_difference(self.actor_critic.value_pred,
                                  self._value_target))
        self.loss = self.policy_loss + self.value_loss

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.update = optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()

    def preprocess_state(self, observation):
        '''
        This function does preprocessing for discrete observations.

        Params:
          - observation: State -- state to be preprocessed
        Returns:
          - out: State -- one-hot encoded state if discrete, the same state otherwise
        '''
        if self.actor_critic.discrete_states:
            E = np.identity(self.actor_critic.num_states)
            observation = E[observation]
        return observation

    def observe(self, old_observation, action, new_observation, reward, done,
                value_pred=None):
        old_observation = self.preprocess_state(old_observation)
        new_observation = self.preprocess_state(new_observation)
        self.memory.insert(old_observation, action, new_observation, reward,
                           done, value_pred)
        self.next_pred = self.sess.run(
            self.actor_critic.value_pred,
            feed_dict={self._state: new_observation})

    def act(self, observation):
        observation = self.preprocess_state(observation)
        action, value = self.sess.run(
            [self.actor_critic.sample, self.actor_critic.value_pred],
            feed_dict={self._state: observation})
        return action, value

    def episode_end(self):
        if (self.episode_num + 1) % self.update_frequency == 0:
            advantages, returns = self.memory.compute_advantages(
                self.gamma, self.lambd, self.next_pred)
            # returns = self.memory.compute_returns(self.gamma)
            states = self.memory.old_states.reshape(
                -1, *self.actor_critic.state_shape)
            actions = self.memory.actions.reshape(
                -1, *self.actor_critic.action_shape)
            self.sess.run(self.update,
                          feed_dict={
                              self._state: states,
                              self._action: actions,
                              self._advantage: advantages.reshape(-1),
                              self._value_target: returns.reshape(-1)
                          })
            # self.sess.run(self.update,
            #               feed_dict={
            #                   self._state: states,
            #                   self._action: actions,
            #                   self._advantage: returns.reshape(-1),
            #                   self._value_target: returns.reshape(-1)
            #               })
            self.memory.clear()
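A2C.episode_end defers the advantage calculation to Memory.compute_advantages, which is not shown here. Below is a standalone sketch of the generalized advantage estimation that the gamma/lambd arguments and the bootstrap value suggest; the function name and exact storage layout are assumptions, not the project's API.

import numpy as np


def compute_gae(rewards, values, dones, next_value, gamma=0.99, lambd=0.7):
    """Generalized advantage estimation over one batch of transitions.

    rewards, values and dones are 1-D arrays of equal length (dones is 0/1);
    next_value is the critic's prediction for the state after the last step.
    Returns (advantages, returns), where returns serve as the critic targets.
    """
    advantages = np.zeros(len(rewards), dtype=np.float32)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        bootstrap = next_value if t == len(rewards) - 1 else values[t + 1]
        delta = rewards[t] + gamma * bootstrap * (1.0 - dones[t]) - values[t]
        gae = delta + gamma * lambd * (1.0 - dones[t]) * gae
        advantages[t] = gae
    returns = advantages + np.asarray(values, dtype=np.float32)
    return advantages, returns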
class Board:
    """ Board definition and handling """

    @staticmethod
    def getList():
        """ Return the list of boards described in the configuration file """
        boardDir = Config.getBoardDir()
        boardFile = os.path.join(boardDir, "board_description.cfg")
        Board.config = Config(boardFile)
        boardList = Board.config.getItems("Boards")
        return boardList

    def __init__(self, boardName, display):
        """
        Initialize a board by loading its definition

        @param boardName: name of board in board description file
        @type boardName: string
        @param display: where to display board
        @type display: Windows
        """
        self.display = display
        self.deviceModuleList = []
        self.program = None
        self.boardHelp = None
        self.archHelp = None
        self.loadDefinition(boardName)

    def loadDefinition(self, boardName):
        """
        Load the board definition from the definition file

        @param boardName: name of board to look for in the description file
        @type boardName: string
        """
        items = Board.config.getItems(boardName)
        self.deviceList = []
        for item in items:
            name = item[0]
            value = item[1]
            if name == "arch":
                self.archName = value
            elif name == "memory":
                if value == "max":
                    self.memorySize = -1
                else:
                    self.memorySize = int(value)
            elif name[0:6] == "device":
                device = shlex.split(value, "#")
                self.deviceList.append(device)
            elif name == "help":
                fileName = os.path.join(Config.getBoardDir(), value)
                if os.path.isfile(fileName):
                    self.boardHelp = fileName

    def build(self):
        """
        Build the board:
          - load its architecture: chip with registers, opcodes and so on
          - load its controller definitions and initialize them
          - initialize its memory
          - attach controllers to their I/O addresses
        """
        archDir = Config.getArchDir()
        archPath = os.path.join(archDir, "arch_" + self.archName + ".py")
        self.archModule = imp.load_source(self.archName, archPath)
        self.archHelp = os.path.join(archDir, "arch_" + self.archName + ".html")
        if not os.path.isfile(self.archHelp):
            self.archHelp = None
        self.chip = self.archModule.Chip()
        if self.memorySize == -1:
            self.memorySize = 2 ** (self.chip.getAddressSize() * 8)
        self.controller = Controller(self.display)
        self.controller.loadDeviceList()
        self.memory = Memory(self, self.memorySize)
        self.memory.addController(self.controller)
        for device in self.deviceList:
            self.deviceModuleList.append(self.controller.createDevice(device))

    def clear(self):
        """
        Clear the board: reset memory and chip (PC, SP, other registers and so on)
        """
        self.memory.clear()
        self.chip.clear()

    def delete(self):
        """ Delete the board: remove attached controllers """
        self.controller.delete()

    def loadProgram(self, fileName):
        """
        Load the program into memory

        @param fileName: file containing the code to execute
        @type fileName: string
        """
        try:
            self.program = Program()
            self.program.load(fileName, self)
            self.chip.setEndProgram(False)
            return True
        except MemError as e:
            e.display()
            return False
        except ProgramError:
            return False
class Reinforce(Agent):
    '''
    This class defines an Agent which uses the REINFORCE policy-gradient algorithm.

    Params of __init__:
      - env: Environment -- environment to use;
      - gamma: float -- discount factor;
      - learning_rate: float -- learning rate;
      - num_units: int -- number of units per layer;
      - num_layers: int -- number of layers;
      - update_frequency: int -- number of episodes per update.
    '''

    def __init__(self, env, gamma=0.99, learning_rate=0.1, num_units=1,
                 num_layers=0, update_frequency=5):
        super(Reinforce, self).__init__(env)
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.update_frequency = update_frequency
        self.num_units = num_units
        self.num_layers = num_layers
        self.memory = Memory()
        tf.reset_default_graph()
        self.build()
        self.sess = tf.Session(config=get_tf_config())
        self.sess.run(self.init)

    def build(self):
        '''
        This function builds the TF graph and all the ops belonging to it.
        As a result new members are acquired:
          - self.policy: feed-forward policy producing actions or their logits
          - self._state: state placeholder
          - self._action: action placeholder
          - self._reward: reward placeholder
          - self.loss: loss tensor
          - self.update: train_op -- updates the neural network using REINFORCE
          - self.init: all variables initializer
        '''
        def num_or_shape(space):
            return space.n if isinstance(space, spaces.Discrete) else space.shape

        state_num_or_shape = num_or_shape(self.env.observation_space)
        action_num_or_shape = num_or_shape(self.env.action_space)
        self.policy = FeedForwardPolicy(self.num_layers, self.num_units,
                                        state_num_or_shape, action_num_or_shape)
        self._state = self.policy.state
        self._action = self.policy.action
        self._reward = tf.placeholder(shape=[None], dtype=tf.float32)
        self.loss = -tf.reduce_mean(self._reward * self.policy.log_probability)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.update = optimizer.minimize(self.loss)
        self.init = tf.global_variables_initializer()

    def preprocess_state(self, observation):
        '''
        This function does preprocessing for discrete observations.

        Params:
          - observation: State -- state to be preprocessed
        Returns:
          - out: State -- one-hot encoded state if discrete, the same state otherwise
        '''
        if self.policy.discrete_states:
            observation = _one_hot(observation, self.num_states)
        return observation

    def observe(self, old_observation, action, new_observation, reward, done):
        old_observation = self.preprocess_state(old_observation)
        new_observation = self.preprocess_state(new_observation)
        self.memory.insert(old_observation, action, new_observation, reward, done)
        if done and (self.episode_num + 1) % self.update_frequency == 0:
            discounted_rewards = self.memory.compute_returns(self.gamma)
            self.sess.run(self.update,
                          feed_dict={
                              self._state: self.memory.old_states,
                              self._action: self.memory.actions,
                              self._reward: discounted_rewards
                          })

    def act(self, observation):
        observation = self.preprocess_state(observation)
        return self.sess.run(self.policy.sample,
                             feed_dict={self._state: [observation]})[0]

    def episode_end(self):
        if (self.episode_num + 1) % self.update_frequency == 0:
            self.memory.clear()
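Reinforce.observe likewise relies on Memory.compute_returns, which is not shown. A minimal sketch of the per-step discounted returns it implies, computed backwards and reset at episode boundaries; the function name and signature are illustrative assumptions.

import numpy as np


def compute_returns(rewards, dones, gamma=0.99):
    """Per-step discounted return G_t = r_t + gamma * G_{t+1}."""
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        if dones[t]:
            running = 0.0  # reset at episode boundaries
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns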
class DQN():

    def __init__(self, n_features, n_actions, hidden_layers, lr=0.001,
                 gamma=0.99, experience_limit=None):
        self.n_features = n_features
        self.n_actions = n_actions
        self.nn = NeuralNetwork(n_features, n_actions, hidden_layers, lr)

        # memory of episodes
        self.experience = Memory(maxlen=experience_limit)

        # `action` selection algorithm parameters
        self.explore_start = 1.0
        self.explore_stop = 0.1
        self.decay_rate = 0.0001

        # hyperparameters
        self.lr = lr
        self.gamma = gamma

    def action_values(self, states, action=None):
        output = self.nn.nn_output(states)
        if action is not None:
            output = output[action]
        return output

    def best_action(self, state):
        state = np.array(state)
        matrix_form = state.reshape((1, *state.shape))
        output = self.action_values(matrix_form)[0]
        action = np.argmax(output)
        return action

    def next_action(self, state, n_episodes=0):
        explore_p = self.explore_stop + \
            (self.explore_start - self.explore_stop) * np.exp(-self.decay_rate * n_episodes)
        if np.random.rand() < explore_p:
            # explore
            action = np.random.choice(self.n_actions)
        else:
            action = self.best_action(state)
        return action

    def fill_experience(self, exp):
        self.experience.add(exp)

    def extend_experience(self, exp):
        self.experience.extend(exp)

    def clear_experience(self):
        self.experience.clear()

    def train_batch_states(self, batch_size):
        batch = self.experience.sample(batch_size)
        states = np.array([step[0] for step in batch])
        actions = np.array([step[1] for step in batch])
        rewards = np.array([step[2] for step in batch])
        next_states = np.array([step[3] for step in batch])
        ends = np.array([step[4] for step in batch])

        # query the NN to get action-values
        action_values = self.action_values(next_states)
        # if it is a `terminal` point, set its action-values to 0
        action_values[ends] = (0, ) * self.n_actions
        targets = rewards + self.gamma * np.max(action_values, axis=1)

        # training ...
        feed = {self.nn.inputs__: states,
                self.nn.actions__: actions,
                self.nn.targets__: targets}
        loss, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
        return loss

    def train_an_episode(self, episode):
        states = np.array([step[0] for step in episode])
        actions = np.array([step[1] for step in episode])
        rewards = np.array([step[2] for step in episode])
        next_states = np.array([step[3] for step in episode])
        ends = np.array([step[4] for step in episode])

        action_values = self.action_values(next_states)
        # if it is a `terminal` point, set its action-values to 0
        action_values[ends] = (0, ) * self.n_actions
        targets = rewards + self.gamma * np.max(action_values, axis=1)

        # training ...
        feed = {self.nn.inputs__: states,
                self.nn.actions__: actions,
                self.nn.targets__: targets}
        loss, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
        return loss

    def train_multi_episodes(self, episodes):
        all_states = None
        all_actions = None
        all_targets = None
        for episode in episodes:
            states = np.array([step[0] for step in episode])
            actions = np.array([step[1] for step in episode])
            rewards = np.array([step[2] for step in episode])
            next_states = np.array([step[3] for step in episode])

            action_values = self.action_values(next_states)
            # the last one is a `terminal` point, mark it with 0
            action_values[-1] = (0, ) * self.n_actions
            targets = rewards + self.gamma * np.max(action_values, axis=1)

            if all_states is None:
                all_states = states
                all_actions = actions
                all_targets = targets
            else:
                # concatenate
                all_states = np.concatenate((all_states, states))
                all_actions = np.concatenate((all_actions, actions))
                all_targets = np.concatenate((all_targets, targets))

        # training batch ...
        feed = {self.nn.inputs__: all_states,
                self.nn.actions__: all_actions,
                self.nn.targets__: all_targets}
        losses, _ = self.nn.sess.run([self.nn.loss, self.nn.opt], feed_dict=feed)
        return losses

    def learn_from_experience(self, batch_size):
        # Sample a batch of transitions and train on it; the commented-out
        # variant trained on whole episodes instead.
        # batch = self.experience.sample(batch_size)
        # losses = 0
        # for episode in batch:
        #     losses += self.train_an_episode(episode)
        losses = self.train_batch_states(batch_size)
        return losses