def get_best_action(self, sess, state, actionable_nodes, actions_vector, sample_idx=0):
    if self.include_partial_solution:
        features_per_graph = [np.copy(state.feature_matrix)]
    else:
        features_per_graph = [state.feature_matrix.transpose()[:self.num_features].transpose()]
    nodes_mask_per_graph = [1]

    if self.variable_support:
        all_nodes = np.arange(len(state.feature_matrix))
        actioned_or_actionable = np.concatenate((state.partial_solution_node_indexes, actionable_nodes))
        all_non_considered_nodes = np.setxor1d(all_nodes, actioned_or_actionable)
        constrained_adj = sp.csr_matrix(self.undirected_adj)
        constrained_adj = zero_rows(self.sparse_undirected_adj, all_non_considered_nodes)
        constrained_adj = zero_columns(constrained_adj, all_non_considered_nodes)
        constrained_support = preprocess_adj(constrained_adj.tocoo())
        support_per_graph = [constrained_support]
    else:
        support_per_graph = [self.sparse_constant_support]

    if self.zero_non_included_nodes:
        # zero the features of all nodes that are neither actioned nor actionable, so that
        # they have no effect on the actioned nodes (the ones we care about) during convolution
        all_nodes = np.arange(len(state.feature_matrix))
        actioned_or_actionable = np.concatenate((state.partial_solution_node_indexes, actionable_nodes))
        all_non_considered_nodes = np.setxor1d(all_nodes, actioned_or_actionable)
        features_per_graph[0][all_non_considered_nodes] = np.zeros(self.num_features, np.float32)

    feed = self.construct_masked_feed_dict(self.placeholders, features_per_graph, support_per_graph,
                                           FLAGS.num_simultaneous_graphs, len(actions_vector),
                                           nodes_mask_per_graph=nodes_mask_per_graph)

    # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    # run_metadata = tf.RunMetadata()
    # probabilities = sess.run([self.model.masked_prediction_op], feed_dict=feed, options=run_options, run_metadata=run_metadata)[0]
    # tl = timeline.Timeline(run_metadata.step_stats)
    # ctf = tl.generate_chrome_trace_format()
    # with open('get_best_action_timeline.json', 'w') as f:
    #     f.write(ctf)

    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        probabilities = sess.run([self.model.prediction_op, self.model.pred_print_ops], feed_dict=feed)[0]
    else:
        probabilities = sess.run([self.model.prediction_op], feed_dict=feed)[0]

    logging.info("Probability map across actions for sample " + str(sample_idx) + " was: "
                 + ":".join(map(str, probabilities)))

    # sample a (node, label) pair from the joint probability map
    selected_node_action = np.random.choice(range(probabilities.size), p=probabilities.ravel())
    node_idx, allocation = np.unravel_index(selected_node_action, probabilities.shape)

    action = Action()
    action.node_idx = actionable_nodes[node_idx]
    action.label = allocation
    return action, probabilities[node_idx][allocation]
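# A minimal, self-contained sketch of the sampling step in get_best_action above:
# given a (num_actionable_nodes x num_labels) probability matrix, draw one flat
# index and unravel it into a (node, label) pair. The matrix below is illustrative
# toy data, not output from the real model.
import numpy as np

toy_probabilities = np.array([[0.10, 0.20],
                              [0.05, 0.25],
                              [0.30, 0.10]])  # rows: actionable nodes, columns: labels; entries sum to 1
flat_choice = np.random.choice(toy_probabilities.size, p=toy_probabilities.ravel())
node_idx, allocation = np.unravel_index(flat_choice, toy_probabilities.shape)
print(node_idx, allocation, toy_probabilities[node_idx][allocation])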
def get_best_actions(self, sess, state_per_transition, actionable_nodes_per_transition, actions_vector):
    logging.debug("There are " + str(len(state_per_transition)) + " states (one per transition).")
    features_per_graph = [state.feature_matrix for state in state_per_transition]
    nodes_mask_per_graph = [actionable_node_indexes for actionable_node_indexes in actionable_nodes_per_transition]

    logging.debug("Getting best actions")
    feed = construct_masked_feed_dict(self.placeholders, features_per_graph, self.support,
                                      FLAGS.num_simultaneous_graphs, len(actions_vector),
                                      nodes_mask_per_graph=nodes_mask_per_graph)
    rewards = sess.run([self.model.masked_prediction_op], feed_dict=feed)[0]

    best_future_action_per_transition = []
    best_future_reward_per_transition = []
    # TODO should be general to output_dim, not just cpu/gpu
    num_nodes_analyzed = 0
    for graph_idx in range(len(state_per_transition)):
        # this slice has len(actionable_nodes_per_transition[graph_idx]) rows
        rewards_for_transition = rewards[num_nodes_analyzed:num_nodes_analyzed + len(actionable_nodes_per_transition[graph_idx])]
        cpu_rewards = rewards_for_transition.transpose()[0].transpose()
        gpu_rewards = rewards_for_transition.transpose()[1].transpose()
        max_cpu_reward = np.amax(cpu_rewards)
        max_gpu_reward = np.amax(gpu_rewards)
        # break ties between equally good nodes at random
        best_cpu_index = np.random.choice(np.argwhere(cpu_rewards == max_cpu_reward).flatten(), 1)[0]
        best_gpu_index = np.random.choice(np.argwhere(gpu_rewards == max_gpu_reward).flatten(), 1)[0]

        action = Action()
        if max_cpu_reward == max_gpu_reward:
            allocation = np.random.choice(actions_vector, 1)[0]
            action.node_idx = [actionable_nodes_per_transition[graph_idx][best_cpu_index],
                               actionable_nodes_per_transition[graph_idx][best_gpu_index]][allocation]
            action.label = allocation
            best_reward = max_cpu_reward
        elif max_cpu_reward > max_gpu_reward:
            action.node_idx = actionable_nodes_per_transition[graph_idx][best_cpu_index]
            action.label = 0
            best_reward = max_cpu_reward
        else:
            action.node_idx = actionable_nodes_per_transition[graph_idx][best_gpu_index]
            action.label = 1
            best_reward = max_gpu_reward

        best_future_action_per_transition.append(action)
        best_future_reward_per_transition.append(best_reward)
        num_nodes_analyzed += len(actionable_nodes_per_transition[graph_idx])

    return best_future_action_per_transition, best_future_reward_per_transition
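# Sketch of the tie-breaking pattern used in get_best_actions above: instead of
# np.argmax (which always returns the first maximum), pick uniformly among all
# indices that attain the maximum reward. Toy data only.
import numpy as np

toy_rewards = np.array([0.3, 0.9, 0.9, 0.1])
best_indices = np.argwhere(toy_rewards == np.amax(toy_rewards)).flatten()  # -> [1, 2]
best_index = np.random.choice(best_indices, 1)[0]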
def construct_action(self, action_num):
    if self.demand_curve_shape == DemandCurveShape.RECIPROCAL:
        indices = np.unravel_index(action_num, (POSSIBLE_UNITS_PERSON.shape[0],
                                                POSSIBLE_PRICES_PERSON.shape[0],
                                                POSSIBLE_RECIP_DEMAND_PARAMS_PERSON.shape[0]))
        units = POSSIBLE_UNITS_PERSON[indices[0]]
        price = POSSIBLE_PRICES_PERSON[indices[1]]
        demand_curve = reciprocal(POSSIBLE_RECIP_DEMAND_PARAMS_PERSON[indices[2]])(NUM_GOODS_MAX_BUY)
    elif self.demand_curve_shape == DemandCurveShape.LINEAR:
        indices = np.unravel_index(action_num, (POSSIBLE_UNITS_PERSON.shape[0],
                                                POSSIBLE_PRICES_PERSON.shape[0],
                                                POSSIBLE_LIN_DEMAND_PARAMS_PERSON.shape[0]))
        units = POSSIBLE_UNITS_PERSON[indices[0]]
        price = POSSIBLE_PRICES_PERSON[indices[1]]
        demand_curve = linear(POSSIBLE_LIN_DEMAND_PARAMS_PERSON[indices[2]])(NUM_GOODS_MAX_BUY)
    else:
        # guard against an unhandled shape, which would otherwise raise a confusing NameError below
        raise ValueError("Unsupported demand curve shape: " + str(self.demand_curve_shape))
    return Action(price, units, demand_curve)
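# How a flat action number maps onto the (units, price, demand-parameter) grid in
# construct_action above, via np.unravel_index. The small tables here are placeholders,
# not the real POSSIBLE_* arrays.
import numpy as np

units_table = np.array([1, 2, 4])
prices_table = np.array([5.0, 10.0])
params_table = np.array([0.1, 0.5, 1.0, 2.0])

action_num = 17  # any value in range(3 * 2 * 4)
u, p, d = np.unravel_index(action_num, (units_table.shape[0], prices_table.shape[0], params_table.shape[0]))
print(units_table[u], prices_table[p], params_table[d])  # -> 4 5.0 0.5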
def get_action(self, model):
    if self.rltype == RLType.TRIVIAL:
        demand_curve = np.array([20])
        return Action(10, self.num_hours_to_work, demand_curve)
    if self.rltype == RLType.REINFORCE:
        # The state should be the model environment (Model)
        state_input = self.deconstruct_state(model)
        action_num = self.policy_net.choose_action(state_input)
        return self.construct_action(action_num)
    if self.rltype == RLType.Q_ACTOR_CRITIC:
        state_input = self.deconstruct_state(model)
        action_num = self.actor_critic.choose_action(state_input)
        return self.construct_action(action_num)
def __init__(self, map_file_name, curiosity=True):
    self.env = Environment(map_file_name)
    self.cat = Cat(("orange", (255, 165, 0)), 0, 0)
    self.mouse = Mouse(("gray", (128, 128, 128)), 0, 0)
    self.cheese = Cheese(("yellow", (255, 255, 0)), 0, 0)
    self.init_agents_position()
    self.action = Action()
    self.feed = 0
    self.eaten = 0
    self.age = 0
    self.ai = AI()
    pygame.init()
    self.size = 40
    self.screen = None
    self.activated = False
    self.curiosity = curiosity
# imports and walker setup assumed from the companion script below
import numpy as np
from tpot import TPOTRegressor
from tpot.config import regressor_config_dict_light

from agent import SillyWalker, Action

walker = SillyWalker()


def scoring(y_real, y_predicted):
    # total of the final (reward) column of the predictions, divided by n - 1
    return sum(y_predicted)[-1] / (len(y_predicted) - 1)


for i in range(10):
    print('#' * 80)
    print(f'# GENERATION {i + 1}')
    print('#' * 80)

    x = np.array(walker.state_history[:-1])
    y = np.array([list(a) + [r] for a, r in zip(walker.action_history, walker.reward_history)])
    walker.save_history(f'sillywalker{i + 1}')

    model = TPOTRegressor(generations=5, population_size=20, scoring=scoring,
                          verbosity=2, config_dict=regressor_config_dict_light)
    model.fit(x, y)

    for _ in range(10):
        while not walker.done:
            s = walker.state
            prediction = model.predict(np.array([s]))[0]
            print(prediction)
            action = Action(*prediction[:-1])
            walker.step(action)
        walker.reset()
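# What the custom scoring function above computes, shown on a toy prediction matrix
# whose last column is the reward: the summed reward column divided by (n - 1).
# Toy data only; the real y matrices come from the walker's history.
import numpy as np

toy_y_predicted = np.array([[0.1, 0.2, 1.0],
                            [0.3, 0.1, 2.0],
                            [0.2, 0.4, 3.0]])
toy_score = sum(toy_y_predicted)[-1] / (len(toy_y_predicted) - 1)  # (1 + 2 + 3) / 2 == 3.0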
import numpy as np
from neupy import layers, storage, algorithms
from neupy.exceptions import StopTraining

from agent import SillyWalker, Action, create_net

net = create_net()
storage.load(
    net,
    'nets/net',
)

walker = SillyWalker()

while not walker.done:
    s = np.array([list(walker.state) + [1]])
    prediction = net.predict(s)[0]
    walker.step(Action(*prediction))
    walker._env.render()
def __init__(self, agent, w=0, h=0, nb_trashes=0):
    '''
    Initialize the environment
    :param agent: the agent to add in the environment
    :param w: width of the environment (not including walls)
    :param h: height of the environment (not including walls)
    :param nb_trashes: number of trashes in the environment
    '''
    self.width = self.default_parameters['width'] if w == 0 else w     # set the width
    self.height = self.default_parameters['height'] if h == 0 else h   # set the height
    self.obstacles = self.default_parameters['obstacles']              # set the obstacles

    # things related to the agent (action space, state space, the agent itself)
    self.action_space_n = Action.size()                         # cardinality of the action space
    self.state_space_n = (self.width + 1) * (self.height + 1)   # cardinality of the state space: position of the agent
    self.agent = agent                                          # add the agent to the environment

    # start throwing trashes around to give the agent a job
    self.nb_trashes = self.default_parameters['nb_trashes'] if nb_trashes == 0 else nb_trashes
    self.trashes = []  # all positions of trashes
    i = 0
    random.seed(self.nb_trashes)  # ensure the same trash layout is generated on every run
    while i < self.nb_trashes:
        x = random.randint(1, self.width)
        y = random.randint(1, self.height)
        # only keep a newly generated position if it is not another trash, an obstacle,
        # or the initial position of the agent
        if (x, y) not in self.trashes and (x, y) not in self.obstacles and (x, y) != agent.position:
            self.trashes.append((x, y))
            i += 1

    # for conversion between position and tile
    # # this will help when using Q_table
    # self.pairs = [(i, j) for i in range(self.width + 1) for j in range(self.height + 1)]

    self.fig = figure(figsize=(7, 7))
    self.ax = self.fig.add_subplot(1, 1, 1)
    self.xticks = np.arange(-0.5, self.width + 0.5, 1)
    self.yticks = np.arange(-0.5, self.height + 0.5, 1)
    self.ax.grid()
    self.ax.set_xticks(self.xticks)
    self.ax.set_yticks(self.yticks)
    self.ax.plot(np.array(self.trashes)[:, 0], np.array(self.trashes)[:, 1],
                 "co", markersize=30, alpha=0.2)
    self.ax.plot(np.array(self.obstacles)[:, 0], np.array(self.obstacles)[:, 1],
                 "ks", markersize=30, alpha=0.4)
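# One possible way to flatten an (x, y) position into a single state index, consistent
# with state_space_n = (width + 1) * (height + 1) above and with the commented-out
# self.pairs table. The helper name and layout are hypothetical, not part of the
# original environment.
def position_to_state_index(x, y, width):
    return y * (width + 1) + x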