def beginSimulation(self):
    qlearn = QLearner(self.config)
    # Iterate until the learner reports it is done, publishing the
    # intermediate policy after every iteration.
    while qlearn.renameThis():
        print("Iterate")
        result_policy = qlearn.iterate()
        print("publish")
        self.resultsPolicyPub.publish(result_policy)
        print("Will Continue?", qlearn.renameThis())
    # Signal completion, give subscribers time to receive it, then shut down.
    self.simulationCompletePub.publish(True)
    rospy.sleep(10)
    rospy.signal_shutdown("Simulation has Completed")
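For context, here is a minimal sketch of the node setup this method relies on. The attribute names `resultsPolicyPub` and `simulationCompletePub` come from the method body above; the node name, topic names, and message types are assumptions, since the real node may use a custom policy message rather than String.

import rospy
from std_msgs.msg import Bool, String

class SimulationNode(object):  # hypothetical class name
    def __init__(self, config):
        rospy.init_node('qlearn_simulation')  # node name is an assumption
        self.config = config
        # Topic names and message types below are assumptions.
        self.resultsPolicyPub = rospy.Publisher('results_policy', String,
                                                queue_size=10)
        self.simulationCompletePub = rospy.Publisher('simulation_complete',
                                                     Bool, queue_size=10)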
# Use tabular reinforcement learning
if not args.usefunc and not args.usempc:
    # Set up testbench
    mode = QLearner.ONLINE if args.online else QLearner.OFFLINE
    tb = TestBench(size=args.topology, seed=seed, learner=None)
    dmap = distance_map(tb.goals, args.topology, args.topology, flatten=True)
    learner = QLearner(lrate=args.rate, discount=args.discount,
                       depth=args.maxdepth, steps=args.steps,
                       policy=args.policy, max_prob=args.greedyprob,
                       mode=mode, rmatrix=tb.rmatrix, tmatrix=tb.tmatrix,
                       goal=tb.goals, seed=seed)
    tb.learner = learner
    fault(tb, args.fault)
    # Run simulation with intermittent faults and adaptive learning
    # tb.learner.learn(coverage=args.coverage)
    coords, traversed, final, length, goal = adaptive_path(
        tb, start, args.explore, fault, args.fault)
    if args.numtrials == 1:
        print_results(coords, final, length, goal)
    # Show optimal and adaptive paths to goal state
mdp_solvers = {'Value Iteration': gw.run_value_iterations,
               'Policy Iteration': gw.run_policy_iterations}

for solver_name, solver_fn in mdp_solvers.items():
    print('Final result of {}:'.format(solver_name))
    policy_grids, utility_grids = solver_fn(iterations=25, discount=0.5)
    print(policy_grids[:, :, -1])
    print(utility_grids[:, :, -1])
    plt.figure()
    gw.plot_policy(utility_grids[:, :, -1])
    plot_convergence(utility_grids, policy_grids)
    plt.show()

ql = QLearner(num_states=(shape[0] * shape[1]),
              num_actions=4,
              learning_rate=0.8,
              discount_rate=0.9,
              random_action_prob=0.5,
              random_action_decay_rate=0.99,
              dyna_iterations=0)

start_state = gw.grid_coordinates_to_indices(start)
iterations = 1000
flat_policies, flat_utilities = ql.learn(start_state,
                                         gw.generate_experience,
                                         iterations=iterations)

# Reshape the flat per-iteration results back onto the grid.
new_shape = (gw.shape[0], gw.shape[1], iterations)
ql_utility_grids = flat_utilities.reshape(new_shape)
ql_policy_grids = flat_policies.reshape(new_shape)
print('Final result of QLearning:')
# Jitter the decay rate slightly above 0.99 for this trial.
random_action_decay_rate = 0.99 + random.random() / 100

gw = GridWorldMDP(reward_grid=reward_grid,
                  obstacle_mask=obstacle_mask,
                  terminal_mask=terminal_mask,
                  action_probabilities=[
                      (-1, 0.1),
                      (0, 0.8),
                      (1, 0.1),
                  ],
                  no_action_probability=0.0)

ql = QLearner(num_states=(shape[0] * shape[1]),
              num_actions=4,
              learning_rate=learning_rate,
              discount_rate=discount_rate,
              random_action_prob=random_action_prob,
              random_action_decay_rate=random_action_decay_rate,
              dyna_iterations=0)

start_state = gw.grid_coordinates_to_indices(start)
iterations = 1000
flat_policies, flat_utilities = ql.learn(start_state,
                                         gw.generate_experience,
                                         iterations=iterations)

test_iterations = 1000
value = ql.test(start_state,
                gw.generate_experience,
                iterations=test_iterations)
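For reference, this is the standard tabular update these hyperparameters drive. The QLearner internals are not shown in these snippets, so this is the textbook rule rather than this class's exact implementation; the function name q_update is made up for illustration.

import numpy as np

def q_update(Q, s, a, r, s_next, learning_rate, discount_rate):
    """One tabular Q-learning step: move Q[s, a] toward the TD target."""
    td_target = r + discount_rate * np.max(Q[s_next])
    Q[s, a] += learning_rate * (td_target - Q[s, a])
    return Q

# Exploration: act randomly with probability epsilon, which starts at
# random_action_prob and shrinks geometrically after each step:
#     epsilon *= random_action_decay_rate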
gw = GridWorldMDP(reward_grid=reward_grid,
                  obstacle_mask=obstacle_mask,
                  terminal_mask=terminal_mask,
                  action_probabilities=[
                      (-1, 0.1),
                      (0, 0.8),
                      (1, 0.1),
                  ],
                  no_action_probability=0.0,
                  goal_mask=goal_mask,
                  avoid_mask=avoid_mask)

ql = QLearner(num_states=(reward_grid.shape[0] * reward_grid.shape[1]),
              num_actions=4,
              learning_rate=0.7,
              discount_rate=0.9,
              random_action_prob=0.2,
              random_action_decay_rate=0.99,
              dyna_iterations=0)

# Start in the bottom-left corner of the grid.
start = (reward_grid.shape[0] - 1, 0)
start_state = gw.grid_coordinates_to_indices(start)
iterations = 6000
flat_policies, flat_utilities = ql.learn(start_state,
                                         gw.generate_experience,
                                         iterations=iterations)

new_shape = (gw.shape[0], gw.shape[1], iterations)
ql_utility_grids = flat_utilities.reshape(new_shape)
ql_policy_grids = flat_policies.reshape(new_shape)
# print('Final result of QLearning:')
def solve(self):
    # Build the reward grid: default reward everywhere, with the goal,
    # traps, and obstacles overwriting their cells.
    reward_grid = np.zeros(self.shape) + self.default_reward
    reward_grid[self.goal] = self.goal_reward

    coords = list(zip(*self.traps))
    trap_mask = sparse.coo_matrix((np.ones(len(coords[0])), coords),
                                  shape=self.shape,
                                  dtype=bool).toarray()
    reward_grid[trap_mask] = self.trap_reward

    coords = list(zip(*self.obstacles))
    obstacle_mask = sparse.coo_matrix((np.ones(len(coords[0])), coords),
                                      shape=self.shape,
                                      dtype=bool).toarray()
    reward_grid[obstacle_mask] = 0

    # Goal and trap cells are terminal.
    terminal_mask = np.zeros_like(reward_grid, dtype=bool)
    terminal_mask[self.goal] = True
    terminal_mask[trap_mask] = True

    gw = GridWorldMDP(start=self.start,
                      reward_grid=reward_grid,
                      obstacle_mask=obstacle_mask,
                      terminal_mask=terminal_mask,
                      action_probabilities=[
                          (-1, 0.1),
                          (0, 0.8),
                          (1, 0.1),
                      ],
                      no_action_probability=0.0)

    utility_grid = np.zeros(self.shape)
    gw.plot_policy(
        utility_grid, None,
        str(self.shape[0]) + 'x' + str(self.shape[1]) + ' Gridworld')

    mdp_solvers = {
        'Value Iteration': gw.run_value_iterations,
        'Policy Iteration': gw.run_policy_iterations
    }

    time_results = []
    steps_results = []
    reward_results = []
    for solver_name, solver_fn in mdp_solvers.items():
        print('Solving {}:'.format(solver_name))
        title = str(self.shape[0]) + 'x' + str(
            self.shape[1]) + ' Gridworld - ' + solver_name
        policy_grids, utility_grids, time_stamps, num_steps, total_reward = solver_fn(
            iterations=self.iterations[0], discount=0.5, title=title)

        # Pad the MDP results out to the Q-learning iteration count so the
        # comparison plots share a common x-axis.
        a = np.empty(self.iterations[1] - self.iterations[0])
        a.fill(time_stamps[-1])
        time_stamps = np.concatenate((time_stamps, a))
        time_results.append(time_stamps)
        a.fill(num_steps[-1])
        num_steps = np.concatenate((num_steps, a))
        steps_results.append(num_steps)
        a.fill(total_reward[-1])
        total_reward = np.concatenate((total_reward, a))
        reward_results.append(total_reward)

        # print(policy_grids[:, :, -1])
        # print(utility_grids[:, :, -1])
        gw.plot_policy(utility_grids[:, :, -1], None, title)
        plot_convergence(utility_grids, policy_grids, title)

    # Hyperparameter sweep left from earlier experiments:
    # for lr in [0.7, 0.8, 0.9]:
    #     for ra in [0.2, 0.5, 0.8]:
    #         for e in [.79, .89, .99]:
    ql = QLearner(num_states=(self.shape[0] * self.shape[1]),
                  num_actions=4,
                  obstacle_mask=obstacle_mask,
                  terminal_mask=terminal_mask,
                  learning_rate=0.8,
                  discount_rate=0.975,
                  random_action_prob=0.5,
                  random_action_decay_rate=0.89,
                  dyna_iterations=0)

    print('Solving QLearning:')
    start_state = gw.grid_coordinates_to_indices(self.start)
    # title = str(self.shape[0]) + 'x' + str(self.shape[1]) + ' Gridworld - Q Learning - ' + str(lr).replace('.', '') + str(ra).replace('.', '') + str(e).replace('.', '')
    title = str(self.shape[0]) + 'x' + str(
        self.shape[1]) + ' Gridworld - Q Learning'
    iterations = self.iterations[1]
    flat_policies, flat_utilities, time_stamps, num_steps, total_reward = ql.learn(
        start_state,
        gw,
        iterations=iterations,
        title=str(self.shape[0]) + 'x' + str(self.shape[1]) + '/QL/' + title)

    new_shape = (gw.shape[0], gw.shape[1], iterations)
    ql_utility_grids = flat_utilities.reshape(new_shape)
    ql_policy_grids = flat_policies.reshape(new_shape)
    time_results.append(time_stamps)
    steps_results.append(num_steps)
    reward_results.append(total_reward)

    # print(ql_policy_grids[:, :, -1])
    # print(ql_utility_grids[:, :, -1])
    gw.plot_policy(ql_utility_grids[:, :, -1], ql_policy_grids[:, :, -1],
                   title)
    plot_convergence(ql_utility_grids[:, :, 0:-2],
                     ql_policy_grids[:, :, 0:-2], title)

    plot_time(
        np.array(time_results),
        str(self.shape[0]) + 'x' + str(self.shape[1]) + ' Gridworld - Time')
    plot_num_steps(
        np.array(steps_results),
        str(self.shape[0]) + 'x' + str(self.shape[1]) +
        ' Gridworld - # Steps')
    plot_reward(
        np.array(reward_results),
        str(self.shape[0]) + 'x' + str(self.shape[1]) +
        ' Gridworld - Reward')
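A hypothetical driver for the method above. The enclosing class's constructor is not shown in these snippets, so GridWorldSolver is a placeholder name; the attribute names and the (MDP iterations, Q-learning iterations) pairing are read off what solve() uses, while the concrete values are illustrative only.

# Hypothetical usage; 'GridWorldSolver' is a made-up name for the
# enclosing class, whose constructor is not shown in this snippet.
solver = GridWorldSolver(
    shape=(10, 10),
    start=(9, 0),                      # starting cell
    goal=(0, 9),                       # terminal state with goal_reward
    traps=[(4, 4), (4, 5)],            # terminal states with trap_reward
    obstacles=[(2, 2), (2, 3)],        # impassable cells, reward 0
    default_reward=-0.04,
    goal_reward=1.0,
    trap_reward=-1.0,
    iterations=(25, 6000),             # (MDP solver iters, Q-learning iters)
)
solver.solve()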