def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) self.q = np.random.rand(self.num_states, self.num_actions) self.returns = {(state, action):[] for state in range(self.num_states) for action in range(self.num_actions)} self.epsilon = 1 self.tuple_state_agent_met=[]
def __init__(self, params):
    Agent.__init__(self, params)
    # The 3 following attributes will be given by the env.
    self.n_x = None
    self.n_y = None
    self.states = None
    # Training memory for the function approximator.
    self.X_memory = []
    self.Y_memory = []
    # MLP used as the agent's function approximator.
    self.ia = MLPRegressor(warm_start=True,
                           max_iter=200,
                           early_stopping=False,
                           hidden_layer_sizes=(20, 10, 5),
                           learning_rate_init=1e-3,
                           activation='identity')
    self.epsilon = 0.5
    self.gamma = 0.8
    self.alpha = 0.5
    self.last_action = None
    self.last_state = None
    self.is_action_possible = None
    self.n_max_action = params["max_action_per_episode"]
    self.first_fit = True
    self.key_taken = []
class Game:
    def __init__(self, agent1, agent1parameters, agent2, agent2parameters, game, displayPrefix=None):
        self.board = games.getBoard(game)
        self.agent1 = Agent(agent1, agent1parameters, self.board)
        self.agent2 = Agent(agent2, agent2parameters, self.board)
        self.displayPrefix = displayPrefix

    # Return 1 if agent1 won
    # Return 0 if draw
    # Return -1 if agent2 won
    def playGame(self, displayState=True):
        gameState = self.board.start()
        agent1Turn = True
        turn = 0
        if displayState:
            self.board.display(gameState)
        while not self.board.isGameOver(gameState):
            if self.displayPrefix:
                print(self.displayPrefix + ' Turn {0}'.format(turn), end='\u001b[0K\r')
            turn += 1
            if displayState:
                print('{0}\'s turn'.format(self.agent1.agentType if agent1Turn else self.agent2.agentType))
            gameState = self.agent1.makeMove(gameState) if agent1Turn else self.agent2.makeMove(gameState)
            agent1Turn = not agent1Turn
            if displayState:
                self.board.display(gameState)
        if self.board.winner(gameState) == -1:
            return 0
        return -1 if agent1Turn else 1
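# A hypothetical usage sketch for the Game class above; the agent types, parameter dicts,
# and game name are illustrative placeholders, not taken from the snippet:
#
#     game = Game('random', {}, 'minimax', {'depth': 3}, 'tictactoe', displayPrefix='Match 1')
#     result = game.playGame(displayState=False)
#     # result is 1 if agent1 won, 0 for a draw, -1 if agent2 won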
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) self.epsilon = 1 self.n_key_max = 5 self.alpha = 0.4 self.gamma = 0.9999 self.key_taken = [] self.q = np.random.rand(self.num_states, self.num_actions, self.n_key_max)
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) #strategie initiale (au depart l'agent ne connait pas les deplacements optimaux a effectuer) #tableau de taille nombre d'etat * nombre d'actions possibles self.Q = np.zeros((int(params['num_cells_grid1D']),2)) #Probabilite d'exploration self.exploration = 0.05
def __init__(self, agent1, agent1parameters, agent2, agent2parameters, game, displayPrefix=None):
    self.board = games.getBoard(game)
    self.agent1 = Agent(agent1, agent1parameters, self.board)
    self.agent2 = Agent(agent2, agent2parameters, self.board)
    self.displayPrefix = displayPrefix
def train(self, Q: Agent, task: Task, epsilon: Epsilon, alpha: LearningRate,
          episodes, cache_train=True, test_times=1):
    Q.clear()
    epsilon.clear()
    alpha.clear()
    rewards_history = np.zeros(episodes, dtype=np.float32)
    steps_history = np.zeros(episodes, dtype=np.float32)
    episode_epsilon_history = np.zeros(episodes, dtype=np.float32)
    epsilon_history = []
    conseq_200 = 0
    self.episode = 0
    for e in range(episodes):
        steps, rewards, epsilons = self.run_episode(Q, task, epsilon, alpha)
        if cache_train:
            # Discounted return computed backwards over the training episode.
            returns = 0.0
            for r in rewards[::-1]:
                returns = r + self.gamma * returns
        else:
            # Otherwise evaluate the current policy, averaging over test_times runs.
            returns, steps = 0.0, 0.0
            for _ in range(test_times):
                returns_, steps_ = self.evaluate(Q, task)
                returns += returns_ / test_times
                steps += steps_ / test_times
        rewards_history[e] = returns
        steps_history[e] = steps
        episode_epsilon_history[e] = np.mean(epsilons)
        epsilon_history.append(epsilons)
        if e % 10 == 0:
            print('{} {} {}'.format(episode_epsilon_history[e], returns, steps))
        epsilon.update_end_of_episode(self.episode)
        alpha.update_end_of_episode(self.episode)
        self.episode += 1
        if steps >= 199.99:
            conseq_200 += 1
        else:
            conseq_200 = 0
        # if conseq_200 >= 4:
        #     rewards_history[e:] = rewards_history[e]
        #     steps_history[e:] = steps_history[e]
        #     episode_epsilon_history[e:] = episode_epsilon_history[e]
        #     break
    return steps_history, rewards_history, episode_epsilon_history, \
        np.concatenate(epsilon_history, axis=0)
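# A minimal, standalone sketch of the backward discounted-return computation used above;
# the reward sequence and gamma = 0.9 are illustrative values, not taken from the snippet.
import numpy as np

def discounted_return(rewards, gamma=0.9):
    # Accumulate G = r_t + gamma * G from the last reward back to the first.
    g = 0.0
    for r in rewards[::-1]:
        g = r + gamma * g
    return g

assert np.isclose(discounted_return([1.0, 0.0, 2.0], gamma=0.9),
                  1.0 + 0.9 * (0.0 + 0.9 * 2.0))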
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) #Nombre de lignes self.l = int(params['ligne']) #Nombre de colonnes self.c = int(params['colonne']) #strategie initiale (au depart l'agent ne connait pas les deplacements optimaux a effectuer) #tableau de taille nombre d'etat * nombre d'actions possibles self.Q = np.zeros((self.c * self.l, 4)) #Probabilite d'exploration self.exploration = 0.05
def train(self, Q: Agent, task: Task, policy: Policy, episodes):
    """
    Trains the specified agent on the specified task, using the specified
    exploration policy and the current learning implementation. A specified
    number of episodes is generated for training.

    inputs:
        Q - an Agent object storing the Q-values
        task - a Task object representing the task the agent is learning
        policy - a Policy object representing the exploration policy used to
            balance exploration and exploitation
        episodes - the number of episodes of training to perform
    outputs:
        - a one-dimensional numpy array containing the length of each episode;
          this can be used to check the learning progress of the agent
        - a one-dimensional numpy array containing the sum of the discounted
          rewards obtained from the environment on each episode; this can also
          be used to check the learning progress of the agent
    """
    # initialization
    self.clear()
    Q.clear()
    policy.clear()

    # for storing history of trial
    rewards_history = np.zeros(episodes, dtype=float)
    steps_history = np.zeros(episodes, dtype=int)

    # run episodes
    for e in range(episodes):
        # run an episode of training
        steps, rewards = self.run_episode(Q, task, policy)

        # compute the value of the backup and update the history
        R = 0.0
        for reward in rewards[::-1]:
            R = reward + self.gamma * R
        rewards_history[e] = R
        steps_history[e] = steps

        # finish episode
        policy.finish_episode(e)
        Q.finish_episode(e)

    return steps_history, rewards_history
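# A hypothetical call site for the train() method above, assuming a learner class
# (here called "Sarsa") exposes it and that concrete Agent/Task/Policy subclasses exist
# with the interfaces used above; none of these concrete names are taken from the snippet:
#
#     learner = Sarsa(gamma=0.99)
#     Q = TabularAgent(num_states=task.num_states, num_actions=task.valid_actions())
#     steps, returns = learner.train(Q, task, EpsilonGreedyPolicy(epsilon=0.1), episodes=500)
#     # steps[e] is the length of episode e, returns[e] its discounted return.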
def __init__(self, params): """initialisation de l'agent""" Agent.__init__(self, params) self.l = int(params['ligne']) self.c = int(params['colonne']) #strategie initiale (au depart l'agent ne connait pas les deplacements optimaux a effectuer) #tableau de taille nombre d'etat * nombre d'actions possibles self.Q = np.zeros((self.l * self.c, 4)) #Probabilite d'exploration self.exploration = 0.05 self.alpha = 0.6 #coefficient optimises self.gamma = 1
def create_agent(agent_id, connections, config_filename):
    """
    Load config from file and create agent.

    :param agent_id: agent id
    :param connections: connections
    :param config_filename: name of the file with config
    :return: created agent
    """
    config = load_agent_config(config_filename)
    return Agent(agent_id, connections, config)
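# A hypothetical call site for create_agent(); the id, neighbour list, and config file
# name are illustrative values, not taken from the snippet:
#
#     agent = create_agent(agent_id=1, connections=[2, 3], config_filename="agent_config.json")
#     agent.start()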
def sample_batch(self, batch_size=None):
    # Draw a batch via the base class and copy each field into the pre-allocated tensors.
    batch, self.sample_index = Agent.sample_batch(self)
    self.r = self.r.resize_(batch['reward'].shape).copy_(torch.Tensor(batch['reward']))
    self.done = self.done.resize_(batch['done'].shape).copy_(torch.Tensor(batch['done']))
    self.a = self.a.resize_(batch['action'].shape).copy_(torch.Tensor(batch['action']))
    self.s = self.s.resize_(batch['state'].shape).copy_(torch.Tensor(batch['state']))
    self.s_ = self.s_.resize_(batch['next_state'].shape).copy_(torch.Tensor(batch['next_state']))
    self.logpac_old = self.logpac_old.resize_(batch['logpac'].shape).copy_(torch.Tensor(batch['logpac']))
    self.distri = self.distri.resize_(batch['distri'].shape).copy_(torch.Tensor(batch['distri']))
    self.other_data = batch['other_data']
    if self.other_data:
        for key in self.other_data.keys():
            self.other_data[key] = torch.Tensor(self.other_data[key]).type_as(self.s)
def __init__(self, _type, bandits, agent, runs, exp_nums, logdir, k_vec, k_probs):
    self._type = _type
    self.esr_vector = []
    self.esr_probs = []
    self.f1_score = []
    self.f1 = []
    #self.plotter = Plotter()
    self.metrics = Metrics()
    self.k_vec = k_vec
    self.k_probs = k_probs
    self.f1_df = pd.DataFrame()
    self.esrBandit = Agent(0, 10)
    self.logdir = logdir
    for i in range(len(k_vec)):
        self.esrBandit.manual_distribution(k_vec[i], k_probs[i])
    if self._type == "bandit":
        self.bandits = bandits
        self.agent = agent
        self.runs = runs
        self.exp_nums = exp_nums
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) #Nombre de lignes self.l = int(params['ligne']) #Nombre de colonnes self.c = int(params['colonne']) #strategie initiale (au depart l'agent ne connait pas les deplacements optimaux a effectuer) #tableau de taille nombre d'etat * nombre d'actions possibles self.Q = np.zeros((self.c * self.l, 4)) #Probabilite d'exploration self.exploration = 0.05 #Coefficients pour la formule de Q self.alpha = 1 self.gamma = 1 #vitesse de convergence initialisee (etape a partir de laquelle les recompenses totales sont superieures a 60 self.vitesse = 0 #numero de l'episode qui vient de finir self.ep = 0
def act(self, Q: Agent, task: Task, state):
    # set the number of actions of the current task, if not set
    if self.valid_actions == 0:
        self.valid_actions = task.valid_actions()

    # get the distribution over actions for the current state
    pref = self.preferences[state]

    # sample an action from the preference distribution
    action = np.random.choice(self.valid_actions, 1, p=pref)

    # get the greedy action according to Q
    greedy = Q.max_action(state)

    # update the preference distribution
    pref *= (1.0 - self.beta)
    pref[greedy] /= (1.0 - self.beta)
    pref[greedy] += self.beta * (1.0 - pref[greedy])

    return action
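# A standalone sketch checking that the preference update used above keeps the action
# distribution normalised; beta and the initial preferences are illustrative values.
import numpy as np

beta = 0.1
pref = np.array([0.25, 0.25, 0.25, 0.25])
greedy = 2

pref = pref * (1.0 - beta)                    # shrink all preferences
pref[greedy] /= (1.0 - beta)                  # undo the shrink for the greedy action
pref[greedy] += beta * (1.0 - pref[greedy])   # move the greedy preference toward 1

assert np.isclose(pref.sum(), 1.0)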
def cuda(self):
    Agent.cuda(self)
    self.e_NAF = self.e_NAF.cuda()
    self.t_NAF = self.t_NAF.cuda()
class Experiment():
    def __init__(self, _type, bandits, agent, runs, exp_nums, logdir, k_vec, k_probs):
        self._type = _type
        self.esr_vector = []
        self.esr_probs = []
        self.f1_score = []
        self.f1 = []
        #self.plotter = Plotter()
        self.metrics = Metrics()
        self.k_vec = k_vec
        self.k_probs = k_probs
        self.f1_df = pd.DataFrame()
        self.esrBandit = Agent(0, 10)
        self.logdir = logdir
        for i in range(len(k_vec)):
            self.esrBandit.manual_distribution(k_vec[i], k_probs[i])
        if self._type == "bandit":
            self.bandits = bandits
            self.agent = agent
            self.runs = runs
            self.exp_nums = exp_nums

    def run(self):
        if self._type == "bandit":
            avg_log = self.logdir + 'average' + '/'
            if not os.path.exists(avg_log):
                os.makedirs(avg_log, exist_ok=True)

            for run in range(self.runs):
                self.run_df = pd.DataFrame()
                start = time.perf_counter()
                run_log = self.logdir + 'run_' + str(run + 1) + '/'
                if not os.path.exists(run_log):
                    os.makedirs(run_log, exist_ok=True)

                for i in range(self.exp_nums):
                    self.esr_vector = []
                    self.esr_probs = []
                    if i == 0:
                        for j in range(len(self.bandits)):
                            _return_ = self.bandits[j].pull_arm()
                            self.agent.update(j, _return_)
                    action = self.agent.select_action()
                    _return_ = self.bandits[action].pull_arm()
                    self.agent.update(action, _return_)
                    esr_index = self.agent.esr_dominance()
                    self.esr_agent = deepcopy(self.agent)
                    self.esr_agent.distribution = np.array(self.esr_agent.distribution)[esr_index]
                    for val in esr_index:
                        self.esr_vector.append(self.agent.distribution[val].get_distribution()[0])
                        self.esr_probs.append(self.agent.distribution[val].get_distribution()[1])
                    #self.f1.append(self.metrics.precision_recall(self.esr_vector, self.esr_probs, self.k_vec, self.k_probs))
                    self.f1.append(self.metrics.pr_kl(self.esr_agent, self.esrBandit))

                self.run_df['run' + str(run)] = self.f1
                self.run_df['mean'] = self.run_df.mean(axis=1)
                self.f1_df['run' + str(run)] = self.f1
                end = time.perf_counter()

                #self.run_df['average'] = self.f1_df.mean(axis=1)
                #print(self.f1_df)
                #self.f1_score = self.f1_df['Average']
                #self.run_df['average'] = np.mean(np.array(self.f1_score).reshape(-1, 10), axis=1)
                self.run_df.to_csv(run_log + "/f1_score.csv", index=False)

                ser = SER(self.k_vec, self.k_probs)
                ser_expectations = ser.expectations()
                ser_pareto_front = ser.pareto_front()

                print("")
                print('**** Run ' + str(run + 1) + ' - Execution Time: '
                      + str(round((end - start), 2)) + ' seconds ****')
                print(str(len(esr_index)) + " distributions in the ESR set")
                print("ESR Vector and Probabilities")
                for a in range(len(self.esr_vector)):
                    print(self.esr_vector[a])
                    print(self.esr_probs[a])
                    print(" ")
                print("")
                print("SER - Pareto Front")
                print("Number of policies on the pareto front : " + str(len(ser_pareto_front)))
                print(ser_pareto_front)
                print("")

                self.plotter = Plotter(self.esr_vector, self.esr_probs, run_log,
                                       self.exp_nums, True, True)
                self.plotter.plot_run()

            self.f1_df['mean'] = self.f1_df.mean(axis=1)
            #self.f1_df['average'] = np.mean(np.array(self.f1_df['mean']).reshape(-1, 10), axis=1)
            self.f1_df.to_csv(avg_log + "/f1_score.csv", index=False)
            self.plotter = Plotter(self.esr_vector, self.esr_probs, avg_log,
                                   self.exp_nums, True, True)
            self.plotter.plot_run()

        return
def cuda(self):
    Agent.cuda(self)
    self.policy = self.policy.cuda()
    if self.value_type is not None:
        self.value = self.value.cuda()
graph.create_network()
disable_spade_warnings()

f = open('log2.csv', 'w+')
logger.initialize_default_logger(f)

conf1 = AgentConfig()
conf2 = AgentConfig()

conf1.initial_resource = 100
conf1.storage_limit = 100
conf1._policy_builder = SimplePolicy
conf1._policy_builder_args = [0.1, 2, 2, 0.01, 0.01]

conf2.initial_money = 10
conf2.needs_satisfaction_timeout = 5
conf2._needs = random_uniform
conf2._needs_args = [3, 4]
conf2.needs_satisfaction_cost = 0.5
conf2._policy_builder = SimplePolicy
conf2._policy_builder_args = [0.1, 2, 2, 0.01, 0.01]

if __name__ == '__main__':
    # Agents 1 and 2 are connected to each other.
    a1 = Agent(1, [2], conf1)
    a2 = Agent(2, [1], conf2)
    a1.start()
    a2.start()
    visualisation(graph, logger)
    time.sleep(10)
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params)
def distribution(self, Q: Agent, task: Task, state):
    # Softmax (Boltzmann) distribution over the Q-values of the current state.
    values = Q.values(state)
    values = np.exp(values / self.temp)
    values /= np.sum(values)
    return values
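# A minimal standalone sketch of the same softmax/Boltzmann weighting, with the usual
# max-subtraction for numerical stability; the Q-values and temperature are illustrative.
import numpy as np

def softmax_distribution(q_values, temp=1.0):
    # Subtracting the max does not change the result but avoids overflow in exp().
    z = (np.asarray(q_values, dtype=float) - np.max(q_values)) / temp
    p = np.exp(z)
    return p / p.sum()

p = softmax_distribution([1.0, 2.0, 0.5], temp=0.5)
assert np.isclose(p.sum(), 1.0)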
def create_system(self):
    # Instantiate one Agent per index, all sharing the same epsilon and learning rate.
    for i in range(self.num_agents):
        self.dict_agents[i] = Agent(self.num_obs, self.num_messages, self.num_actions,
                                    self.epsilon, self.learning_rate, i, self.T)
def cuda(self):
    Agent.cuda(self)
    self.e_Actor = self.e_Actor.cuda()
    self.e_Critic = self.e_Critic.cuda()
    self.t_Actor = self.t_Actor.cuda()
    self.t_Critic = self.t_Critic.cuda()
def __init__(self, params): """See documentation in the base class""" Agent.__init__(self, params) self.final_state = EnvironmentGrid2D(params).terminal_state
def epsilon_greedy(self, Q: Agent, task: Task, state, epsilon):
    # With probability epsilon explore a random action, otherwise exploit the greedy one.
    if np.random.rand() <= epsilon:
        return random.randrange(task.valid_actions())
    else:
        return Q.max_action(state)
def cuda(self):
    Agent.cuda(self)
    self.e_DQN = self.e_DQN.cuda()
    self.t_DQN = self.t_DQN.cuda()
def main():
    from agents.Agent import Agent
    from envs.Bandit import Bandit
    from wrappers.Distribution import Distribution
    from experiments.Experiment import Experiment
    from plot.Plotter import Plotter
    from wrappers.SER import SER
    import argparse

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--max_val', default=10, type=int)
    parser.add_argument('--timesteps', default=100000, type=int)
    parser.add_argument('--runs', default=1, type=int)
    parser.add_argument('--type_bandits', default='random', type=str)
    args = parser.parse_args()
    print(args)

    max_val = args.max_val
    actions = 5
    episodes = args.timesteps
    runs = args.runs

    plotter = Plotter()
    agent = Agent(actions, max_val)
    bandits = []
    bandit_distributions = []
    bandit_probabilities = []

    #logdir = f'runs/bandit/{args.type_bandits}/'
    #logdir += datetime.now().strftime('%Y-%m-%d_%H-%M-%S_') + str(uuid.uuid4())[:4] + '/'

    if args.type_bandits == 'random':
        # Random distributions
        # param = [random?, number of obs in distribution, num objectives, max value]
        bandits.append(Bandit(param=[True, 5, 2, 5]))
        bandits.append(Bandit(param=[True, 5, 2, 2]))
        bandits.append(Bandit(param=[True, 5, 2, 3]))
        bandits.append(Bandit(param=[True, 5, 2, 5]))
        bandits.append(Bandit(param=[True, 5, 2, 2]))

        print("*** Bandit Distributions ***")
        for i in range(actions):
            print(bandits[i].vectors)
            print(bandits[i].probs)
            print("")

    if args.type_bandits == 'bandit':
        # Manual distributions
        bandits.append(Bandit([[2, 0], [2, 1]], [0.05, 0.05]))
        bandits.append(Bandit([[0, 0], [1, 1]], [0.1, 0.1]))
        bandits.append(Bandit([[1, 0], [1, 3]], [0.1, 0.1]))
        bandits.append(Bandit([[1, 0], [2, 1]], [0.1, 0.4]))
        bandits.append(Bandit([[1, 1], [1, 2]], [0.05, 0.05]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))

    if args.type_bandits == 'realworld':
        # Manual distributions
        bandits.append(Bandit([[2, 0], [2, 1], [3, 2], [4, 2]], [0.05, 0.05, 0.1, 0.8]))
        bandits.append(Bandit([[0, 0], [1, 1], [2, 0], [2, 1]], [0.1, 0.1, 0.5, 0.3]))
        bandits.append(Bandit([[1, 0], [1, 3], [3, 4], [5, 4]], [0.1, 0.1, 0.2, 0.6]))
        bandits.append(Bandit([[1, 0], [2, 1], [3, 1], [3, 2]], [0.1, 0.4, 0.4, 0.1]))
        bandits.append(Bandit([[1, 1], [1, 2], [4, 0], [0, 0]], [0.05, 0.05, 0.1, 0.8]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))
        #bandits.append(Bandit([[1, 1], [1, 5], [2, 3], [1, 2]], [0.1, 0.2, 0.5, 0.2]))

    for i in range(actions):
        #print(bandits[i].vectors)
        #print(bandits[i].probs)
        bandit_distributions.append(bandits[i].vectors)
        bandit_probabilities.append(bandits[i].probs)
        #print(" ")

    esr_vectors = []
    esr_probabilities = []
    esr_vectors.append(bandit_distributions[0])
    esr_vectors.append(bandit_distributions[2])
    esr_probabilities.append(bandit_probabilities[0])
    esr_probabilities.append(bandit_probabilities[2])

    experiment = Experiment("bandit", bandits, agent, runs, episodes, esr_vectors, esr_probabilities)
    experiment.run()

    #plotter.multi_cdf_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_pdf_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.heatmap_plot([[2, 0], [2, 1], [3, 2], [4, 2]], [0.05, 0.05, 0.1, 0.8])

    #plots to generate
    #plotter.multi_heatmap_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_cdf_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_joint_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)
    #plotter.multi_3d_pdf_bar_plot(experiment.esr_vector, experiment.esr_probs)
'''