def run(self, algo, T, **kwargs):
    visualiser = GraphViz(description='tmp')
    params, pomdp = self.params, None
    total_rewards, budget = 0, params.budget

    log.info('~~~ initialising ~~~')
    with PomdpParser(params.env_config) as ctx:
        # creates model and solver
        model = self.create_model(ctx.copy_env())
        pomdp = self.create_solver(algo, model)

        # supply additional algo params
        belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
        if algo == 'pbvi':
            belief_points = ctx.generate_belief_points(kwargs['stepsize'])
            pomdp.add_configs(belief_points)
        elif algo == 'pomcp':
            pomdp.add_configs(budget, belief, **kwargs)

    # have fun!
    log.info('''
    ++++++++++++++++++++++
        Starting State:  {}
        Starting Budget: {}
        Time Horizon:    {}
        Max Play:        {}
    ++++++++++++++++++++++'''.format(model.curr_state, budget, T, params.max_play))

    for i in range(params.max_play):
        # plan, take action and receive environment feedbacks
        pomdp.solve(T)
        action = pomdp.get_action(belief)
        new_state, obs, reward, cost = pomdp.take_action(action)

        if params.snapshot and isinstance(pomdp, POMCP):
            # takes snapshot of belief tree before it gets updated
            self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

        # update states
        belief = pomdp.update_belief(belief, action, obs)
        total_rewards += reward
        budget -= cost

        # print info
        log.info('\n'.join([
            'Taking action: {}'.format(action),
            'Observation: {}'.format(obs),
            'Reward: {}'.format(reward),
            'Budget: {}'.format(budget),
            'New state: {}'.format(new_state),
            ## 'New Belief: {}'.format(belief),
            '=' * 20
        ]))

        if budget <= 0:
            log.info('Budget spent.')

        if action == 'Catch' and ('tagged' in new_state):
            break

        input("Press Enter to run the next step of the algorithm")

    log.info('{} games played. Total reward = {}'.format(i + 1, total_rewards))
    return pomdp
def run(self, modo, problema, algo, T, **kwargs):
    # per-run statistics are accumulated as numpy arrays (numpy assumed imported as np)
    steps = np.array([], float)
    rewards = np.array([], float)

    # Benchmark mode runs 30 executions; any other mode runs a single one
    if modo == "Benchmark":
        c = 0
    else:
        c = 29

    while c < 30:
        c += 1
        if modo == "Benchmark":
            log.info("===================== Execution " + str(c) + '=====================')

        visualiser = GraphViz(description='tmp')
        params, pomdp = self.params, None
        total_rewards, budget = 0, params.budget

        with PomdpParser(params.env_config) as ctx:
            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
            if algo == 'pbvi':
                belief_points = ctx.generate_belief_points(kwargs['stepsize'])
                pomdp.add_configs(belief_points)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

        if modo != "Benchmark":
            log.info('''
            ++++++++++++++++++++++
                Starting State:  {}
                Starting Budget: {}
                Init Belief:     {}
                Time Horizon:    {}
                Max Play:        {}
            ++++++++++++++++++++++'''.format(model.curr_state, budget, belief, T, params.max_play))

        condicion_parada = False
        i = 0
        while not condicion_parada and params.max_play > i:
            i += 1
            # plan, take action and receive environment feedbacks
            pomdp.solve(T, modo)
            action = pomdp.get_action(belief)
            new_state, obs, reward, cost = pomdp.take_action(action)

            # problem-specific stopping condition: stop once a terminal action is chosen
            if problema == "Tigre":
                condicion_parada = action == "open-left" or action == "open-right"
            elif problema == "LaserTag":
                condicion_parada = action == "Catch"
            elif problema == "Recipientes":
                condicion_parada = action == "bebe-izq" or action == "bebe-med" or action == "bebe-der"

            if params.snapshot and isinstance(pomdp, POMCP):
                # takes snapshot of belief tree before it gets updated
                self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

            # update states
            belief = pomdp.update_belief(belief, action, obs)
            total_rewards += reward
            budget -= cost

            if modo == "Interactivo":
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    'Budget: {}'.format(budget),
                    'New state: {}'.format(new_state),
                    'New Belief: {}'.format(belief),
                    'Step number: {}'.format(i),
                    '=' * 20
                ]))

            if budget <= 0:
                log.info('The allotted budget has been exceeded.')

            if params.max_play != 'inf' and params.max_play <= i:
                log.info('The maximum number of steps has been reached.')

        log.info('{} steps executed. Total accumulated reward = {}\n'.format(i, total_rewards))
        steps = np.append(steps, i)
        rewards = np.append(rewards, total_rewards)

    if modo == "Benchmark":
        mean_steps = steps.mean()
        std_steps = steps.std()
        mean_rewards = rewards.mean()
        std_rewards = rewards.std()
        print("#########################################################################################")
        print("# BENCHMARK RESULTS:")
        print("# Mean number of steps: ", mean_steps)
        print("# Standard deviation of steps: ", std_steps)
        print("# Mean reward: ", mean_rewards)
        print("# Standard deviation of rewards: ", std_rewards)
        print("#########################################################################################")

    return pomdp
def replay(self, algo, T, **kwargs):
    visualiser = GraphViz(description='tmp')
    params, pomdp = self.params, None
    total_rewards, budget = 0, params.budget
    log.info('~~~ initialising experience replay ~~~')

    ## 4 experiences
    with PomdpParser(params.env_config) as ctx:
        for simulation in range(4):
            log.info('~~~ initialising simulation: ' + str(simulation) + '~~~')

            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
            if algo == 'pbvi':
                # load alphavec policy file
                # belief_points = pomdp.generate_reachable_belief_points(belief, 50)
                # pomdp.add_configs(belief_points)
                pomdp.charging_policy(params.policyfile)
                # pomdp.solve(T)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

            total_rewards = 0

            # have fun!
            log.info('''
            ++++++++++++++++++++++
                Init Belief: {}
                Max Play:    {}
            ++++++++++++++++++++++'''.format(belief, params.max_play))

            # obs and plotting only exist after the first step; initialise them so the
            # early-exit branch below cannot reference them before assignment
            obs = None
            plotting = None

            for i in range(params.max_play):
                # plan, take action and receive environment feedbacks
                if algo == 'pomcp':
                    pomdp.solve(T)

                # take action
                action = pomdp.get_action(belief)
                # new_state, obs, reward, cost = pomdp.take_action(action)

                # getting exp action
                exp_action = self.getting_mode_from_expfile(i, simulation, pomdp)
                if exp_action == -1:
                    log.info('\n'.join(['Observation: {}'.format(obs), 'Mission ended']))
                    if plotting is not None:
                        plotting.destroy()
                    break

                if params.snapshot and isinstance(pomdp, POMCP):
                    # takes snapshot of belief tree before it gets updated
                    self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

                if i == 0:
                    plotting = AnimateBeliefPlot(belief, action, exp_action)
                else:
                    plotting.update(belief, action, exp_action, obs)

                # getting features to produce the symbolic observation
                features = self.getting_features_from_expfile(i, simulation, pomdp)
                # print(features)
                label = self.classif_model.predict(features)
                # print(label)

                # transform the label into a pomdp observation
                available_observations = pomdp.model.observations
                obs = available_observations[int(label[0])]

                belief = pomdp.update_belief(belief, exp_action, obs)

                # print log info
                log.info('\n'.join([
                    'Observation: {}'.format(obs),
                    'POMDP would take action: {}'.format(action),
                    'action taken during EXPERIMENT: {}'.format(exp_action),
                    'New Belief: {}'.format(belief),
                    '=' * 20
                ]))

    return pomdp
def run(self, algo, T, **kwargs):
    visualiser = GraphViz(description='tmp')
    params, pomdp = self.params, None
    total_rewards, budget = 0, params.budget
    environment = params.env_config
    benchmark = params.benchmark

    log.info('~~~ initialising ~~~')
    with PomdpParser(params.env_config) as ctx:
        # creates model and solver
        model = self.create_model(ctx.copy_env())
        pomdp = self.create_solver(algo, model)

        # supply additional algo params
        belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
        if algo == 'pbvi':
            belief_points = ctx.generate_belief_points(kwargs['stepsize'])
            pomdp.add_configs(belief_points)
        elif algo == 'pomcp':
            pomdp.add_configs(budget, belief, **kwargs)

    # have fun!
    log.info('''
    ++++++++++++++++++++++
        Starting State:  {}
        Starting Budget: {}
        Init Belief:     {}
        Time Horizon:    {}
        Max Play:        {}
    ++++++++++++++++++++++'''.format(model.curr_state, budget, belief, T, params.max_play))

    for i in range(params.max_play):
        # plan, take action and receive environment feedbacks
        pomdp.solve(T)
        action = pomdp.get_action(belief)
        new_state, obs, reward, cost = pomdp.take_action(action)

        if params.snapshot and isinstance(pomdp, POMCP):
            # takes snapshot of belief tree before it gets updated
            self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

        # update states
        belief = pomdp.update_belief(belief, action, obs)
        total_rewards += reward
        budget -= cost

        # Printing the details for every step of the interactive simulation
        # log.info('\n'.join([
        #     'Taking action: {}'.format(action),
        #     'Observation: {}'.format(obs),
        #     'Reward: {}'.format(reward),
        #     'Budget: {}'.format(budget),
        #     'New state: {}'.format(new_state),
        #     'New Belief: {}'.format(belief),
        #     '=' * 20
        # ]))

        # Tiger problem ----------------------------------------------------------------
        # When the open action is selected the tiger problem ends: either the person
        # escapes or is eaten by the tiger, so the simulation has to stop.
        if "Tiger-2D.POMDP" in environment:
            if "open" in action:
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    '=' * 20
                ]))
                break
            log.info('\n'.join([
                'Taking action: {}'.format(action),
                'Observation: {}'.format(obs),
                'Reward: {}'.format(reward),
                'New Belief: {}'.format(belief),
                '=' * 20
            ]))

        # Web ads problem ----------------------------------------------------------------
        # When the adv action is selected the web ads problem ends: the person gets either
        # a tie or a skate advertisement, so the simulation has to stop.
        if "Web.POMDP" in environment:
            if params.benchmark == 0:
                if "adv" in action:
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward),
                        '=' * 20
                    ]))
                    break
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    'New Belief: {}'.format(belief),
                    '=' * 20
                ]))

        # Landing problem ----------------------------------------------------------------
        # When the arrive action is selected the landing problem ends: either the crew
        # finds a treasure or they die horribly to the creatures at the landing site.
        if "Landing.POMDP" in environment:
            if "arrive" in action:
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    '=' * 20
                ]))
                break
            log.info('\n'.join([
                'Taking action: {}'.format(action),
                'Observation: {}'.format(obs),
                'Reward: {}'.format(reward),
                'New Belief: {}'.format(belief),
                '=' * 20
            ]))

        # Tag problem ----------------------------------------------------------------
        # When the state is tagged, robot s has caught robot t and the tag problem ends,
        # so the simulation has to stop.
        if "Tag.POMDP" in environment:
            if params.benchmark == 0:
                if "tagged" in model.curr_state:
                    log.info('\n'.join([
                        'Taking action: {}'.format(action),
                        'Observation: {}'.format(obs),
                        'Reward: {}'.format(reward),
                        '=' * 20
                    ]))
                    break
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    'New state: {}'.format(new_state),
                    # 'New Belief: {}'.format(belief),
                    '=' * 20
                ]))

    # Printing the total steps and reward when the loop ends.
    if params.benchmark == 0:
        log.info('Simulation ended after {} steps. Total reward = {}'.format(i + 1, total_rewards))
    return pomdp
def runBench(self, algo, T, **kwargs):
    visualiser = GraphViz(description='tmp')
    params, pomdp = self.params, None
    total_rewards, budget = 0, params.budget
    environment = params.env_config
    benchmark = params.benchmark

    log.info('~~~ Initialising simulation ~~~')
    with PomdpParser(params.env_config) as ctx:
        # creates model and solver
        model = self.create_model(ctx.copy_env())
        pomdp = self.create_solver(algo, model)

        # supply additional algo params
        belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
        if algo == 'pbvi':
            belief_points = ctx.generate_belief_points(kwargs['stepsize'])
            pomdp.add_configs(belief_points)
        elif algo == 'pomcp':
            pomdp.add_configs(budget, belief, **kwargs)

    # have fun!
    log.info('''
    ++++++++++++++++++++++
        Starting State:  {}
        Starting Budget: {}
        Init Belief:     {}
        Time Horizon:    {}
        Max Play:        {}
    ++++++++++++++++++++++'''.format(model.curr_state, budget, belief, T, params.max_play))

    for i in range(params.max_play):
        # plan, take action and receive environment feedbacks
        pomdp.solve(T)
        action = pomdp.get_action(belief)
        new_state, obs, reward, cost = pomdp.take_action(action)

        if params.snapshot and isinstance(pomdp, POMCP):
            # takes snapshot of belief tree before it gets updated
            self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

        # update states
        belief = pomdp.update_belief(belief, action, obs)
        total_rewards += reward
        budget -= cost

        # Computing final results when a problem stops
        if "open" in action or "tagged" in model.curr_state or "adv" in action or "arrive" in action:
            log.info('Ended simulation after {} steps. Total reward = {}'.format(i + 1, total_rewards))
            self.step_list.append(i + 1)
            self.fReward_list.append(total_rewards)
            self.steps += i + 1
            self.fReward += total_rewards
            break

        # Printing the details for every step of the interactive simulation
        # log.info('\n'.join([
        #     'Taking action: {}'.format(action),
        #     'Observation: {}'.format(obs),
        #     'Reward: {}'.format(reward),
        #     'Budget: {}'.format(budget),
        #     'New state: {}'.format(new_state),
        #     'New Belief: {}'.format(belief),
        #     '=' * 20
        # ]))

        if budget <= 0:
            log.info('Budget spent.')

    return pomdp
def replay(self, algo, T, **kwargs):
    visualiser = GraphViz(description='tmp')
    params, pomdp = self.params, None
    total_rewards, budget = 0, params.budget
    log.info('~~~ initialising experience replay ~~~')

    with PomdpParser(params.env_config) as ctx:
        total_rewards_simulations = []
        for simulation in range(params.sim):
            log.info('~~~ initialising simulation: ' + str(simulation) + '~~~')

            # creates model and solver
            model = self.create_model(ctx.copy_env())
            pomdp = self.create_solver(algo, model)

            # supply additional algo params
            belief = ctx.random_beliefs() if params.random_prior else ctx.generate_beliefs()
            if algo == 'pbvi':
                # load alphavec policy file
                # belief_points = pomdp.generate_reachable_belief_points(belief, 50)
                # pomdp.add_configs(belief_points)
                pomdp.charging_policy(params.policyfile)
                # pomdp.solve(T)
            elif algo == 'pomcp':
                pomdp.add_configs(budget, belief, **kwargs)

            total_rewards = 0

            # have fun!
            log.info('''
            ++++++++++++++++++++++
                Starting State: {}
                Init Belief:    {}
                Max Play:       {}
            ++++++++++++++++++++++'''.format(model.curr_state, belief, params.max_play))

            for i in range(params.max_play):
                # plan, take action and receive environment feedbacks
                if algo == 'pomcp':
                    pomdp.solve(T)
                action = pomdp.get_action(belief)
                new_state, obs, reward, cost = pomdp.take_action(action)

                if params.snapshot and isinstance(pomdp, POMCP):
                    # takes snapshot of belief tree before it gets updated
                    self.snapshot_tree(visualiser, pomdp.tree, '{}.gv'.format(i))

                # update states
                belief = pomdp.update_belief(belief, action, obs)
                total_rewards += reward
                budget -= cost

                # print info
                log.info('\n'.join([
                    'Taking action: {}'.format(action),
                    'Observation: {}'.format(obs),
                    'Reward: {}'.format(reward),
                    'Budget: {}'.format(budget),
                    'New state: {}'.format(new_state),
                    'New Belief: {}'.format(belief),
                    '=' * 20
                ]))

                if budget <= 0:
                    log.info('Budget spent.')

            log.info('{} games played. Total reward = {}'.format(i + 1, total_rewards))
            total_rewards_simulations.append(total_rewards)

    exp_total_reward = np.mean(total_rewards_simulations)
    std_exp_total_reward = np.std(total_rewards_simulations)
    print(params.sim, 'simulations played.')
    print('Exp total reward = ', exp_total_reward)
    print('Std Exp total reward = ', std_exp_total_reward)
    log.info('{} simulations played. Exp total reward = {}'.format(params.sim, exp_total_reward))
    log.info('Total rewards observed = {}'.format(total_rewards_simulations))
    return pomdp