def generate_random_epochs(learntAgents=False, save_front_json=False, epochs=range(1)):
    reshaping = True
    cars_outs = []
    rewards = []
    rewards_mean = []
    if learntAgents:
        agents: List[SmartAgent] = get_LearnSmartAgents()
    else:
        agents: List[SmartAgent] = get_SmartAgents()
    for agent in agents:
        agent.memories = []
    for e in epochs:
        Globals().epsilon = 1
        env: Env = epoch(agents,
                         u=Globals().get_u(Globals().vp.max_time_learn),
                         time=Globals().vp.max_time_learn)
        for agent in env.agents:
            agent.reshape_rewards()
        if save_front_json:
            exportData = ExportData(learningMethod='DQN',
                                    learningEpochs=0,
                                    nets=env.global_memories,
                                    netName='politechnika',
                                    densityName='random_now' + str(Globals().greedy_run_no))
            exportData.saveToJson()
        env.remember_memory()
    save_batches(agents)
    return agents
def epoch_greedy(env):
    Globals().time = 0
    Globals().epsilon = 0
    for t in range(max_time):  # max_time is a module-level constant
        actions: List[ActionInt] = [
            agent.get_action(agent.local_state) for agent in env.agents
        ]
        env.step(actions)
    return env
def full_batch(self, only_learn_usable=False):
    x_batch = []
    y_batch = []
    i = 0
    l_rate = Globals().learning_rate
    gamma = Globals().vp().gamma
    memories = self.memories if not only_learn_usable else [
        mem for mem in self.memories if mem.learn_usable
    ]
    for memory in memories:
        state = memory.state.to_learn_array()
        action = 2 if memory.action == 'orange' else memory.action
        y_target = self.model.predict(state)
        new_state_possible_actions_value_predictions = self.model.predict(
            memory.new_state.to_learn_array())
        if memory.action == 1 and memory.new_state.actual_phase == 'orange':
            print('action 1, next action value',
                  new_state_possible_actions_value_predictions)
            print('')
        max_next_action_value = max(
            new_state_possible_actions_value_predictions[0]
        ) if memory.state.starting_actual_phase != 'orange' else \
            new_state_possible_actions_value_predictions[0][-1]
        target = (1 - l_rate) * y_target[0][action] + l_rate * (
            memory.reward + gamma * max_next_action_value)
        i += 1
        y_target[0][action] = target
        x_batch.append(state[0])
        y_batch.append(y_target[0])
    return x_batch, y_batch
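# A note on the update above: the target blends the network's current estimate
# with the one-step bootstrapped return,
#   Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + gamma * max_a' Q(s', a')).
# A minimal, self-contained sketch of that rule on plain numpy arrays (the
# shapes and numbers below are illustrative assumptions, not project values):
import numpy as np

def soft_q_target(q_sa, reward, q_next, alpha=0.1, gamma=0.9):
    """Blend the current Q-value with the one-step bootstrapped return."""
    return (1 - alpha) * q_sa + alpha * (reward + gamma * np.max(q_next))

q_row = np.array([0.5, 1.2, -0.3])      # Q(s, .) for one state
q_next_row = np.array([0.1, 0.7, 0.0])  # Q(s', .) for the successor state
q_row[1] = soft_q_target(q_row[1], reward=2.0, q_next=q_next_row)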
def epoch_greedy(env):
    Globals().time = 0
    # one tally slot per action plus a final slot for the yellow phase,
    # kept separately for each of the four agents
    actions_count = [[0, 0, 0, 0, 0] for _ in range(4)]
    for t in range(Globals().vp.max_time_greedy):
        actions: List[ActionInt] = [
            agent.get_action(state=agent.local_state, greedy=True)
            for agent in env.agents
        ]
        env.step(actions)
        for agent_index in range(4):
            if actions[agent_index] != yellow:
                actions_count[agent_index][int(actions[agent_index])] += 1
            else:
                actions_count[agent_index][-1] += 1
    # in the original, this print came after the return and was unreachable
    print('actions taken', actions_count)
    return env.u
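# A hedged aside: the per-agent tallies above can also be kept with
# collections.Counter, which needs no pre-sized lists. This is a sketch of
# the idea with illustrative actions, not project code:
from collections import Counter

action_counts = [Counter() for _ in range(4)]
for agent_index, action in enumerate([0, 1, 'yellow', 2]):  # example actions
    action_counts[agent_index][action] += 1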
def test_no_5_reshaping_pass_action_1_1_1_long_time_then_2_2_2_long_time_then_3_3_3(
        self):
    # UNDER TEST: rewards
    max_time = 90
    agents = get_SmartAgents()
    Globals().time = 0
    Globals().gamma = 0
    env = Env(agents)
    env.u = env_settings.u_all_2
    for t in range(max_time):
        # actions stay [1, 1, 1] until t == 60, then switch to [2, 2, 2] and
        # later [3, 3, 3], with [0, 0, 0] inserted during each phase transition
        actions = [1, 1, 1]
        if t == 60 or 63 <= t < 70:
            actions = [2, 2, 2]
        if t == 61 or t == 62:
            actions = [0, 0, 0]
        if t == 70 or t >= 73:
            actions = [3, 3, 3]
        if t == 71 or t == 72:
            actions = [0, 0, 0]
        env.step(actions)
    env.agents[0].save_batch()
    env.agents[0].reshape_rewards()
    self.assertAlmostEqual(env.agents[0].memories[60].reward, 23.3, 0)
    self.assertAlmostEqual(env.agents[0].memories[61].reward, 37, 0)
    self.assertAlmostEqual(env.agents[0].memories[62].reward, 39.1, 0)
    env.update_global_memory_rewards()
    exportData = ExportData(learningMethod='Monte Carlo TODO',
                            learningEpochs=0,
                            nets=env.global_memories,
                            netName='net4',
                            densityName='test_no_5')
    exportData.saveToJson()
def run_learnt_greedy(saveJson=True):
    Globals().cars_out_memory = []
    model_file_names = [
        'static_files/model-agent0.h5', 'static_files/model-agent1.h5',
        'static_files/model-agent2.h5', 'static_files/model-agent3.h5'
    ]
    agents = get_LearnSmartAgents(model_file_names)
    env = Env(agents)
    u = epoch_greedy(env)
    rewards_sum, rewards_mean = count_rewards(env)
    cars_out = env.cars_out
    if saveJson:
        exportData = ExportData(learningMethod='DQN',
                                learningEpochs=0,
                                nets=env.global_memories,
                                netName='env4',
                                densityName='learnt_' + str(Globals().greedy_run_no))
        exportData.saveToJson()
    maximum_possible_cars_out = Globals().u_value * Globals().vp.max_time_greedy * 8
    cars_out_percentage = round(100 * cars_out / maximum_possible_cars_out, 2)
    print(f'greedy run {Globals().greedy_run_no} - '
          f'rewards_mean: {round(rewards_mean, 2)} '
          f'rewards_sum: {round(rewards_sum, 0)}. '
          f'{round(sum(sum(u)), 0)} vehicles entered the network, '
          f'{round(cars_out, 0)} left it '
          f'({cars_out_percentage}% of the maximum possible).')
    Globals().greedy_run_no += 1
    return rewards_mean, rewards_sum, cars_out, agents, sum(sum(u)), cars_out_percentage
def test_no_2_pass_action_0_long_time_then_1(self):
    # UNDER TEST: phase changes
    max_time = 90
    agents = get_SmartAgents()
    for agent in agents:
        agent.yellow_phase_duration = 2
    Globals().time = 0
    env = Env(agents)
    Globals().u_value = 2
    env.u = Globals().get_u(max_time)
    env.yellow_phase_duration = 2
    for t in range(max_time):
        actions = [0, 0, 0]
        if t == 60 or t > 62:
            actions = [1, 1, 1]
        if t == 61 or t == 62:
            actions = [yellow, yellow, yellow]
        time = Globals().time  # time = t
        env.step(actions)
        time = Globals().time  # time = t + 1
        if t in range(3, 60):
            self.assertEqual([agent.actual_phase for agent in agents], [0, 0, 0])
        if t == 60 or t == 61:
            self.assertEqual([agent.actual_phase for agent in agents],
                             [yellow, yellow, yellow])
        if t >= 62:
            self.assertEqual([agent.actual_phase for agent in agents], [1, 1, 1])
    exportData = ExportData(learningMethod='Nothing',
                            learningEpochs=0,
                            nets=env.global_memories,
                            netName='net14',
                            densityName='test_no_2')
    exportData.saveToJson()
def run_learnt_greedy(saveJson=True):
    model_file_names = [
        'static_files/model-agent0.h5', 'static_files/model-agent1.h5',
        'static_files/model-agent2.h5'
    ]
    agents = get_LearnSmartAgents(model_file_names)
    env = Env(agents)
    epoch_greedy(env)
    rewards_sum, rewards_mean = count_rewards(env)
    cars_out = env.cars_out
    if saveJson:
        exportData = ExportData(learningMethod='DQN',
                                learningEpochs=0,
                                nets=env.global_memories,
                                netName='env3',
                                densityName='learnt_' + str(Globals().greedy_run_no))
        exportData.saveToJson()
    maximum_possible_cars_out = Globals().u_value * Globals().vp().max_time_greedy * 3
    print(f'greedy run {Globals().greedy_run_no} - '
          f'rewards_mean: {round(rewards_mean, 2)} '
          f'rewards_sum: {round(rewards_sum, 0)} '
          f'cars_out: {round(cars_out, 0)} '
          f'fraction of vehicles that left the network: {cars_out / maximum_possible_cars_out}')
    Globals().greedy_run_no += 1
    return rewards_mean, rewards_sum, cars_out, agents
def save_batch(self):
    gamma = Globals().gamma
    batch_size = Globals().batch_size
    # minibatch = random.sample(self.memories, batch_size)
    x_batch = []
    y_batch = []
    i = 0
    for memory in self.memories:
        if memory.action == 0:
            continue
        i += 1
        state = memory.state.to_9_densities_learn_array()
        new_state = memory.new_state.to_9_densities_learn_array()
        y = self.model.predict(state)
        future_actions_values_predictions = self.model.predict(new_state)
        possible_actions = memory.state.possible_actions(self.orange_phase_duration)
        # inner loop variable renamed from `i` to `a` so it no longer shadows the counter
        best_possible_future_action_value = np.amax([
            future_actions_values_predictions[0][a] for a in possible_actions
        ])
        # target = reward + (discount rate gamma) * (maximum target Q over future
        # actions a'): the Q-value of the action that led from state to new_state,
        # based on the immediate reward and the best future action's value
        target_action = memory.reward + gamma * best_possible_future_action_value
        y[0][memory.action] = target_action
        x_batch.append(state[0])
        y_batch.append(y[0])
        if self.index == 0:
            Globals().x_batch.append(state[0])
            Globals().y_batch.append(y[0])
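# The commented-out random.sample line above hints at classic experience
# replay, where the network is fit on a random minibatch rather than the
# whole memory. A minimal sketch of that variant, assuming the buffer is a
# plain list (names here are illustrative, not project API):
import random

def sample_minibatch(memories, batch_size):
    """Draw a uniform random minibatch; fall back to the full buffer if small."""
    if len(memories) <= batch_size:
        return list(memories)
    return random.sample(memories, batch_size)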
def plot_pred_memory(name=str(Globals().run_no)):
    # note: the default `name` is evaluated once, at import time
    plt.plot([pred[0][0] for pred in Globals().pred_plot_memory],
             color='red', label='0')
    plt.plot([pred[0][1] for pred in Globals().pred_plot_memory],
             color='green', label='1')
    plt.legend()
    plt.title('Rewards predicted for the actions taken in the monitored state')
    plt.savefig('plot' + name + '.png')
    plt.close()
def train(learntAgents=True, max_time_learn=20):
    if not learntAgents:
        agents = get_SmartAgents()
    else:
        agents = get_LearnSmartAgents()
    models = [agent.model for agent in agents]
    batches = get_batches()
    start_time = timer()
    x_batch = batches[0]['x_batch']
    y_batch = batches[0]['y_batch']
    model = models[0]
    val_loss = 5000
    escape_flag = False
    while timer() - start_time < max_time_learn and not escape_flag:
        res = model.fit(x_batch,
                        y_batch,
                        batch_size=100,
                        epochs=1,
                        verbose=0,
                        validation_split=0.2)
        if res.history['val_loss'][-1] > val_loss:
            # stop as soon as the validation loss starts rising again
            escape_flag = True
            loss = res.history['val_loss'][-1]
            print(f'network result: validation loss {loss}')
            val_loss = 5000
        else:
            val_loss = res.history['val_loss'][-1]
    x = [4, 20]
    pred = model.predict(np.array([x]))
    Globals().pred_plot_memory.append(pred)
    model.save('static_files/model-agent' + str(0) + '.h5')
    plt.plot([pred[0][0] for pred in Globals().pred_plot_memory],
             color='red', label='0')
    plt.plot([pred[0][1] for pred in Globals().pred_plot_memory],
             color='green', label='1')
    plt.legend()
    plt.title('Rewards predicted for the actions taken\nin the monitored state [4, 20]')
    plt.savefig('images_generated/state_predictions.png')
    plt.close()
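# The escape_flag loop above is a hand-rolled early-stopping rule: training
# stops once the validation loss rises. A hedged alternative sketch using
# Keras's built-in callback (the epoch budget here replaces the wall-clock
# budget of the loop above, which is a deliberate deviation):
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=1,
                           restore_best_weights=True)
# model.fit(x_batch, y_batch, batch_size=100, epochs=200,
#           validation_split=0.2, verbose=0, callbacks=[early_stop])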
def save_batch(self):
    gamma = Globals().gamma
    batch_size = Globals().batch_size
    # minibatch = random.sample(self.memories, batch_size)
    x_batch = []
    y_batch = []
    i = 0
    for memory in self.memories:
        if memory.action == 'orange':
            # if memory.action == 'orange' or memory.state.starting_actual_phase == 'orange':
            continue
        i += 1
        print('i', i)
        state = memory.state.to_9_densities_learn_array()
        print('state', state)
        print('memory', memory)
        y = self.model.predict(state)
        # here the target is just the immediate reward; the bootstrapped variant
        # (commented out) would be:
        #   target_action = memory.reward + gamma * best_possible_future_action_value
        # i.e. target = reward + (discount rate gamma) * (maximum target Q over
        # future actions a'), the Q-value of the action that led from state to
        # new_state, based on the immediate reward and the best future action's value
        target_action = memory.reward
        y[0][memory.action] = target_action
        x_batch.append(state[0])
        y_batch.append(y[0])
        if self.index == 0:
            Globals().x_batch.append(state[0])
            Globals().y_batch.append(y[0])
def __attrs_post_init__(self):
    Globals().time = 0
    max_time = Globals().vp.max_time_greedy
    self.u = Globals().get_u(max_time)
    self.A = []
    self.cars_out = 0
    self.assign_local_states_to_agents()
def epoch_random(env) -> Env:
    Globals().epsilon = 0
    agents: List[SmartAgent] = get_SmartAgents()
    for t in range(Globals().vp().max_time_learn):
        actions: List[ActionInt] = [
            random.choice(agent.local_action_space) for agent in agents
        ]
        env.step(actions)
    return env
def test_no_5_reshaping_pass_action_0_1_0(self):
    # UNDER TEST: rewards
    max_time = 90
    agents = get_SmartAgents()
    Globals().time = 0
    Globals().gamma = 0
    env = Env(agents)
    env.u = env_settings.u_all_9
    for t in range(max_time):
        # actions stay [0] until t == 60, then switch to [1] and back to [0],
        # with [orange] inserted during each phase transition
        actions = [0]
        if t == 1 or t == 2:
            actions = [orange]
        if t == 60 or 63 <= t < 70:
            actions = [1]
        if t == 61 or t == 62:
            actions = [orange]
        if t == 70 or t >= 73:
            actions = [0]
        if t == 71 or t == 72:
            actions = [orange]
        env.step(actions)
    env.agents[0].save_batch()
    env.agents[0].reshape_rewards()
    # self.assertAlmostEqual(env.agents[0].memories[60].reward, 2, 0)
    # self.assertAlmostEqual(env.agents[0].memories[61].reward, 4, 0)
    # self.assertAlmostEqual(env.agents[0].memories[62].reward, 6, 0)
    # env.update_memory_rewards()
    exportData = ExportData(learningMethod='Monte Carlo TODO',
                            learningEpochs=0,
                            nets=env.global_memories,
                            netName='net11',
                            densityName='test_no_6')
    exportData.saveToJson()
def save_motions(self):
    old_time = Globals().time - 1
    new_time = Globals().time
    self.last_flows = []
    for agent in self.agents:
        actual_moves = () if agent.actual_phase == 'orange' \
            else agent.moves[agent.actual_phase]
        for move in actual_moves:
            if move[0] == 404:
                continue
            A_cell = self.A[Globals().time - 1][move]
            section_from_index = move[1]
            previous_density = self.x[self.t - 1][section_from_index]
            value = A_cell * previous_density
            flow = {
                'agent_index': agent.index,
                'old_time': old_time,
                'new_time': new_time,
                'move': move,
                'value': value
            }
            self.last_flows.append(flow)
    self.flow_memories.extend(self.last_flows)
def train(learntAgents=True, max_time_learn=20):
    l_rate = 0.0001
    layers = [15, 25, 20, 15]
    activation = 'relu'  # 'relu' is an activation, not an optimizer; renamed accordingly
    regularizers_ = [0.2, 0.2, 0.2]
    print('train learntAgents', learntAgents)
    agents = get_LearnSmartAgents()
    # agents built via create_model(layers, activation, l_rate) for i in range(3)
    models = [agent.model for agent in agents]
    batches = get_batches()
    # for i in range(len(models)):
    for i in range(3):
        start_time = timer()
        x_batch = batches[i]['x_batch']
        y_batch = batches[i]['y_batch']
        model = models[i]
        x2 = []
        y2 = []
        val_loss = 5000
        escape_flag = False
        while timer() - start_time < max_time_learn and not escape_flag:
            res = model.fit(x_batch,
                            y_batch,
                            batch_size=100,
                            epochs=1,
                            verbose=0,
                            validation_split=0.2)
            if res.history['val_loss'][-1] > val_loss:
                escape_flag = True
                print('network result (validation loss)', res.history['val_loss'][-1])
                val_loss = 5000
            else:
                val_loss = res.history['val_loss'][-1]
            # res = model.fit(np.array(x2), np.array(y2), batch_size=20, epochs=1, verbose=0)
        if i == 0:
            # x = [7, 10, 10] + [10, 10, 20] + [6, 5, 4] + [2]
            x = [4, 4, 62] + [10, 10, 49] + [0, 10, 10] + [0]
            pred = model.predict(np.array([x]))
            Globals().pred_plot_memory.append(pred)
        # model.evaluate(np.array(x2), np.array(y2))
        model.save('static_files/model-agent' + str(i) + '.h5')
        if i == 0:
            plt.plot([pred[0][0] for pred in Globals().pred_plot_memory],
                     color='red', label='0')
            plt.plot([pred[0][1] for pred in Globals().pred_plot_memory],
                     color='green', label='1')
            plt.plot([pred[0][2] for pred in Globals().pred_plot_memory],
                     color='blue', label='2')
            plt.legend()
            plt.title('Rewards predicted for the actions taken in the monitored state')
            plt.savefig('foo' + str(Globals().run_no) + '.png')
            plt.close()
def remember(self, densities, reward):
    state = self.local_state
    action = self.action
    self.assign_local_state(densities)
    new_state = self.local_state
    times = {'old': Globals().time - 1, 'new': Globals().time}
    memory = Memory(state=state,
                    action=action,
                    new_state=new_state,
                    reward=reward,
                    times=times)
    self.memories.append(memory)
def __attrs_post_init__(self):
    if self.model == 0:
        l_rate = Globals().vp().nn_l_rate
        layers = Globals().vp().layers
        activation = 'relu'
        self.model = self._build_model(layers=layers,
                                       activation=activation,
                                       l_rate=l_rate)
def epoch():
    Globals().time = 0
    env = Env(agents)  # `agents`, `max_time` and `getActions` are module-level names
    for t in range(max_time):
        actions: List[ActionInt] = getActions(t)
        # actions = best_actions[t] if len(best_actions) >= t else [0, 0, 0]
        env.step(actions)
    Globals().epochs_done += 1
    return env
def epoch(agents):
    Globals().time = 0
    env = Env(agents)
    for t in range(max_time):
        actions: List[ActionInt] = [
            random.choice(agent.local_action_space) for agent in agents
        ]
        # actions = best_actions[t] if len(best_actions) >= t else [0, 0, 0]
        env.step(actions)
    Globals().epochs_done += 1
    return env
def epoch(agents, u=env_settings.u_all_2):
    Globals().time = 0
    env = Env(agents)
    env.u = u
    for t in range(max_time):
        actions: List[ActionInt] = [
            agent.get_action(agent.local_state) for agent in agents
        ]
        env.step(actions)
    Globals().epochs_done += 1
    return env
def epoch_random(agents, u=Globals().u):
    # note: the default `u` is evaluated once, at import time
    Globals().time = 0
    env = Env(agents)
    env.u = u
    for t in range(max_time):
        actions: List[ActionInt] = [
            agent.get_action(state=agent.local_state, full_random=True)
            for agent in agents
        ]
        env.step(actions)
    Globals().epochs_done += 1
    return env
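# A caution about the default above: u=Globals().u is evaluated once, at
# import time, so later changes to Globals().u are silently ignored. The
# u=None sentinel pattern (used by the epoch(agents, time, u=None) variant
# below) re-reads the value on every call. A self-contained demonstration
# with illustrative names:
_config = {'u': 1}

def bad(u=_config['u']):   # default captured once, at definition time
    return u

def good(u=None):          # sentinel: re-read on every call
    if u is None:
        u = _config['u']
    return u

_config['u'] = 2
assert bad() == 1 and good() == 2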
def __attrs_post_init__(self):
    self.weights_history_callback = LambdaCallback(
        on_epoch_end=self.add_weight_history)
    if self.model == 0:
        l_rate = Globals().vp().nn_l_rate
        layers = Globals().vp().layers
        activation = 'relu'
        self.model = self._build_model(layers=layers,
                                       activation=activation,
                                       l_rate=l_rate)
def add_returns(self, G):
    for i in range(len(G)):
        key = (self.epoch_local_state_storage[i],
               self.epoch_local_action_storage[i])
        if key in self.returns:
            self.returns[key].append(G[i])
            Globals().state_repeats += 1
        else:
            Globals().new_states += 1
            self.returns[key] = [G[i]]
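# A hedged design note: collections.defaultdict(list) removes the
# key-existence branch from add_returns entirely. Sketch with illustrative
# names (the repeat/new-state counters are kept as plain integers here):
from collections import defaultdict

returns = defaultdict(list)
state_repeats, new_states = 0, 0

def add_return(state, action, g):
    global state_repeats, new_states
    key = (state, action)
    if key in returns:   # membership test does not insert a default entry
        state_repeats += 1
    else:
        new_states += 1
    returns[key].append(g)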
def remember_global_memory(self):
    times = Times(old_time=Globals().time - 1, new_time=Globals().time)
    actions = [agent.action for agent in self.agents]
    rewards = self.global_rewards[self.t - 1]
    densities = self.x[self.t - 1]
    lights = self.A[self.t - 1]
    net = Net(times=times,
              densities=densities,
              rewards=rewards,
              actions=actions,
              lights=lights)
    self.global_memories.append(net)
def save_batch(self):
    x_batch = []
    y_batch = []
    for memory in self.memories:
        if memory.action == 'orange':
            continue
        state = memory.state.to_learn_array()
        y = memory.reward
        x_batch.append(state[0])
        y_batch.append(y)
        if self.index == 0:
            Globals().x_batch.append(state[0])
            Globals().y_batch.append(y)
def epoch(agents, time, u=None):
    if u is None:
        u = Globals().get_u(time)
    Globals().time = 0
    env = Env(agents)
    env.u = u
    for t in range(time):
        actions: List[ActionInt] = [
            agent.get_action(agent.local_state) for agent in agents
        ]
        if actions[0] != yellow:
            Globals().actions_memory[int(actions[0])] += 1
        env.step(actions)
    Globals().epochs_learn_done += 1
    return env
def get_action(self, state):
    if state.to_learn_tuple_used()[-1] == 'orange' and Globals().time != 0:
        return 'orange'
    s = state.to_learn_tuple_used()
    if random.random() < Globals().epsilon:
        random_action = random.choice([0, 1])
        return random_action
    if s not in self.Pi:
        self.Pi[s] = random.choice([0, 1])
        # print(f'time {Globals().time} randomly drawn a: {self.Pi[s]}')
    # else:
    #     print(f'time {Globals().time} existing a: {self.Pi[s]}')
    return self.Pi[s]
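# get_action above is an epsilon-greedy tabular policy: with probability
# epsilon a random action, otherwise the stored (or lazily initialised)
# greedy action for the state. A self-contained sketch of the same pattern
# (all names illustrative):
import random

def epsilon_greedy(pi, s, epsilon, action_space=(0, 1)):
    """Random action with probability epsilon, else the policy's action."""
    if random.random() < epsilon:
        return random.choice(action_space)
    if s not in pi:
        pi[s] = random.choice(action_space)  # lazily initialise unseen states
    return pi[s]

policy = {}
action = epsilon_greedy(policy, s=('low', 'green'), epsilon=0.1)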
def plot_pred_memory(no):
    plt.plot([pred[0][0] for pred in Globals().pred_plot_memory],
             color='red', label='0')
    plt.plot([pred[0][1] for pred in Globals().pred_plot_memory],
             color='green', label='1')
    plt.legend()
    plt.title('Rewards predicted for the actions taken in the monitored state')
    plt.savefig('images_generated/rewards_' + str(no) + '.png')
    plt.close()
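# plot_pred_memory and the inline plotting in the train() variants repeat the
# same per-action loop. A hedged consolidation sketch (the color scheme and
# signature are assumptions, not project API):
import matplotlib.pyplot as plt

def plot_predictions(pred_history, n_actions, out_path):
    """Plot the predicted value of each action across recorded predictions."""
    colors = ['red', 'green', 'blue', 'orange', 'purple']
    for a in range(n_actions):
        plt.plot([pred[0][a] for pred in pred_history],
                 color=colors[a % len(colors)], label=str(a))
    plt.legend()
    plt.title('Rewards predicted for the actions taken in the monitored state')
    plt.savefig(out_path)
    plt.close()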