def __init__(self):
    """Setup ROS things"""
    rospy.init_node('robot')
    self.simComplete_publisher = rospy.Publisher(
        "/map_node/sim_complete",
        Bool,
        queue_size=10
    )
    rospy.sleep(1)
    # call Astar here
    # Astar()
    # call MDP here
    Mdp()
    rospy.sleep(1)
    self.simComplete_publisher.publish(True)
    rospy.sleep(1)
    rospy.signal_shutdown("shutting down")
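# A minimal entry-point sketch for the node above. The imports, the __main__ guard
# and the class name Robot are assumptions for illustration, not part of the
# original snippet.
import rospy
from std_msgs.msg import Bool

if __name__ == '__main__':
    try:
        Robot()  # hypothetical class wrapping the __init__ above
    except rospy.ROSInterruptException:
        pass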
def __init__(self, original_mdp, product_sta, product_lab, policy_tra,
             probs_vect, progs_vect, exp_times_vect):
    Mdp.__init__(self)
    self.original_mdp = original_mdp
    self.n_state_vars = original_mdp.n_state_vars + 1
    self.state_vars = list(original_mdp.state_vars)
    self.state_vars.append("_da")
    self.state_vars_range = dict(original_mdp.state_vars_range)  # ranges for the state vars
    self.initial_state = dict(original_mdp.initial_state)  # dict indexed by the state vars
    self.n_props = original_mdp.n_props  # number of propositional labels
    self.props = list(original_mdp.props)
    self.props_def = dict(original_mdp.props_def)  # dict of MdpPropDef instances, keyed by propositional label name
    self.n_actions = original_mdp.n_actions  # number of actions
    self.actions = list(original_mdp.actions)  # list of action names
    self.transitions = []  # list of MdpTransitionDef instances; won't be filled for the policy, as we work with the flat representations
    self.reward_names = list(original_mdp.reward_names)

    # read sta product file to get flat state descriptions and number of DFA states
    self.n_aut_states = 0
    self.n_flat_states = 0
    self.flat_state_defs = {}
    self.read_prod_state_file(product_sta)
    self.state_vars_range["_da"] = [0, self.n_aut_states]

    # read lab product file to get initial and accepting states
    self.initial_flat_state = -1
    self.acc_flat_states = set()
    self.set_init_and_acc_states(product_lab)
    self.current_flat_state = None
    self.current_state_def = None
    self.set_current_state(self.initial_flat_state)

    self.flat_state_policy = {}  # self.flat_state_policy[flat_state] = action to execute in flat_state
    self.flat_state_sucs = {}  # self.flat_state_sucs[flat_state] = list of possible flat state successors, e.g., [20, 25]
    self.flat_state_suc_probs = {}  # self.flat_state_suc_probs[flat_state] = list of probs for the corresponding flat_state_sucs, e.g., [0.7, 0.3]
    self.transitions = []  # not being set, for efficiency; the flat representations above are easier to build and to use for execution, and only needed for exporting the policy
    self.set_policy_flat(policy_tra)

    self.guarantees_probs = self.read_vect(probs_vect)
    self.guarantees_progs = self.read_vect(progs_vect)
    self.guarantees_times = self.read_vect(exp_times_vect)
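# A minimal sketch (not from the original source) of how the flat policy built above
# could drive execution: look up the prescribed action for the current flat state and
# sample a successor according to flat_state_suc_probs. The function name and the
# numpy-based sampling are assumptions.
import numpy as np

def simulate_policy_step(policy_mdp):
    state = policy_mdp.current_flat_state
    action = policy_mdp.flat_state_policy[state]        # action prescribed by the policy
    successors = policy_mdp.flat_state_sucs[state]      # possible next flat states
    probs = policy_mdp.flat_state_suc_probs[state]      # matching probabilities
    next_state = np.random.choice(successors, p=probs)  # sample one successor
    policy_mdp.set_current_state(next_state)
    return action, next_state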
def main():
    mdp = Mdp()
    policy_value = PolicyValue(mdp)
    policy_value.iterate_policy(mdp)
    print 'value:'
    for i in xrange(1, 6):
        print '%d:%f\t' % (i, policy_value.v[i])
    print ''
    for i in xrange(1, 6):
        print policy_value.pi[i]
def main():
    mdp = Mdp()
    policy_value = PolicyValue(mdp)
    policy_value.iterate_value(mdp)
    print "value:"
    for i in xrange(1, 6):
        print "%d:%f\t" % (i, policy_value.v[i]),
    print ""
    print "policy:"
    for i in xrange(1, 6):
        print "%d->%s\t" % (i, policy_value.pi[i]),
    print ""
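# A hedged sketch, not the original PolicyValue implementation: one way value
# iteration could be written against the Mdp interface used in the neighbouring
# snippets. get_states(), get_actions(), get_gamma() and a deterministic
# transform(s, a) -> (is_terminal, next_state, reward) are assumptions taken from
# the sampling snippet below.
def value_iteration_sketch(mdp, n_sweeps=1000):
    gamma = mdp.get_gamma()
    v = dict((s, 0.0) for s in mdp.get_states())
    pi = dict()
    for _ in xrange(n_sweeps):
        for s in mdp.get_states():
            best_q, best_a = None, None
            for a in mdp.get_actions():
                is_terminal, s1, r = mdp.transform(s, a)
                q = r if is_terminal else r + gamma * v.get(s1, 0.0)
                if best_q is None or q > best_q:
                    best_q, best_a = q, a
            v[s], pi[s] = best_q, best_a
    return v, pi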
def compute_random_pi_state_value():
    value = [0.0 for r in xrange(9)]
    # estimate the mean value with a large number of simulated episodes
    num = 100000
    for k in xrange(1, num):
        for i in xrange(1, 6):
            mdp = Mdp()
            s = i
            is_terminal = False
            gamma = 1.0
            v = 0.0
            while not is_terminal:
                a = random_pi()
                is_terminal, s, r = mdp.transform(s, a)
                v += gamma * r
                gamma *= mdp.gamma
            value[i] = (value[i] * (k - 1) + v) / k
        if k % 10000 == 0:
            print value[1:9]
    print value[1:9]
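# random_pi() is referenced above but not shown. A minimal sketch of a uniform random
# policy, assuming actions are drawn from the MDP's action list via the get_actions()
# accessor used in the Monte Carlo snippet below; the helper body is an assumption,
# not the original code.
import random

_ACTIONS = Mdp().get_actions()  # assumed accessor

def random_pi():
    return random.choice(_ACTIONS)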
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 29 10:56:37 2017

@author: Administrator
"""
from mdp import Mdp

mdp = Mdp()
states = mdp.get_states()
actions = mdp.get_actions()
gamma = mdp.get_gamma()

def mc(gamma, state_sample, action_sample, reward_sample):
    vfunc = dict()
    nfunc = dict()
    for state in states:
        vfunc[state] = 0.0
        nfunc[state] = 0.0
    for i in xrange(len(state_sample)):
        # compute the discounted return of episode i from its first step
        G = 0.0
        for step in xrange(len(state_sample[i]) - 1, -1, -1):
            G *= gamma
            G += reward_sample[i][step]
        for step in xrange(len(state_sample[i])):
            s = state_sample[i][step]
            vfunc[s] += G
            nfunc[s] += 1.0
            # peel off the current step so G becomes the return from the next step onward
            G -= reward_sample[i][step]
            G /= gamma
    # average the accumulated returns to get the Monte Carlo value estimate
    for state in states:
        if nfunc[state] > 0.000001:
            vfunc[state] /= nfunc[state]
    return vfunc
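# A hedged usage sketch for mc() above (not part of the original file). It assumes
# transform(s, a) -> (is_terminal, next_state, reward), as used in the random-policy
# snippet above, and samples episodes under a uniform random policy.
import random

def sample_episodes(num_episodes, max_len=100):
    state_sample, action_sample, reward_sample = [], [], []
    for _ in xrange(num_episodes):
        s = random.choice(states)
        s_list, a_list, r_list = [], [], []
        for _ in xrange(max_len):
            a = random.choice(actions)
            is_terminal, s_next, r = mdp.transform(s, a)
            s_list.append(s)
            a_list.append(a)
            r_list.append(r)
            s = s_next
            if is_terminal:
                break
        state_sample.append(s_list)
        action_sample.append(a_list)
        reward_sample.append(r_list)
    return state_sample, action_sample, reward_sample

# state_sample, action_sample, reward_sample = sample_episodes(1000)
# vfunc = mc(gamma, state_sample, action_sample, reward_sample)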
def __init__(self, width, height, hit=False, walls=[], action_list=[], nb_actions=4,
             gamma=0.9, timeout=50, start_states=[0], terminal_states=[]):
    # width, height : int numbers defining the maze dimensions
    # walls : list of the states that represent walls in our maze environment
    # action_list : list of possible actions
    # nb_actions : used when action_list is empty; by default there are 4 of them (go north, south, east or west)
    # gamma : the discount factor of our mdp
    # timeout : defines the length of an episode (max timestep) --see done() function
    # start_states : list that defines the states where the agent can be at the beginning of an episode
    # terminal_states : list that defines the states corresponding to the end of an episode
    #                   (agent reaches a terminal state) --cf. done() function
    self.width = width
    self.height = height
    self.cells = np.zeros((width, height), int)
    self.walls = walls
    self.size = width * height
    state = 0
    cell = 0
    self.terminal_states = terminal_states
    self.state_width = []
    self.state_height = []

    # ##################### State Space ######################
    for i in range(width):
        for j in range(height):
            if cell not in walls:  # or self.cells[i][j] in self.terminal_states):
                self.cells[i][j] = state
                state = state + 1
                self.state_width.append(i)
                self.state_height.append(j)
            else:
                self.cells[i][j] = -1
            cell = cell + 1
    self.nb_states = state

    # ##################### Action Space ######################
    self.action_space = SimpleActionSpace(action_list=action_list, nactions=nb_actions)

    # ##################### Distribution Over Initial States ######################
    start_distribution = np.zeros(self.nb_states)  # distribution over initial states, supposed to be uniform
    for state in start_states:
        start_distribution[state] = 1.0 / len(start_states)

    # ##################### Transition Matrix ######################
    # a "well" state is added that only the terminal states can get into
    transition_matrix = np.empty(
        (self.nb_states + 1, self.action_space.size, self.nb_states + 1))

    # Init the transition matrix
    transition_matrix[:, N, :] = np.zeros((self.nb_states + 1, self.nb_states + 1))
    transition_matrix[:, S, :] = np.zeros((self.nb_states + 1, self.nb_states + 1))
    transition_matrix[:, E, :] = np.zeros((self.nb_states + 1, self.nb_states + 1))
    transition_matrix[:, W, :] = np.zeros((self.nb_states + 1, self.nb_states + 1))

    for i in range(self.width):
        for j in range(self.height):
            state = self.cells[i][j]
            if not state == -1:
                # Transition matrix when going north (no state change in the top row or directly below a wall)
                if j == 0 or self.cells[i][j - 1] == -1:
                    transition_matrix[state][N][state] = 1.0
                else:  # it goes up
                    transition_matrix[state][N][self.cells[i][j - 1]] = 1.0

                # Transition matrix when going south (no state change in the bottom row or directly above a wall)
                if j == self.height - 1 or self.cells[i][j + 1] == -1:
                    transition_matrix[state][S][state] = 1.0
                else:  # it goes down
                    transition_matrix[state][S][self.cells[i][j + 1]] = 1.0

                # Transition matrix when going east (no state change in the last column or when the cell to the east is a wall)
                if i == self.width - 1 or self.cells[i + 1][j] == -1:
                    transition_matrix[state][E][state] = 1.0
                else:  # it moves east
                    transition_matrix[state][E][self.cells[i + 1][j]] = 1.0

                # Transition matrix when going west (no state change in the first column or when the cell to the west is a wall)
                if i == 0 or self.cells[i - 1][j] == -1:
                    transition_matrix[state][W][state] = 1.0
                else:  # it moves west
                    transition_matrix[state][W][self.cells[i - 1][j]] = 1.0

    # Transition matrix of terminal states
    well = self.nb_states  # all the final states' transitions go there
    for s in self.terminal_states:
        transition_matrix[s, :, :] = 0
        transition_matrix[s, :, well] = 1

    if hit:
        reward_matrix = self.reward_hit_walls()
    else:
        reward_matrix = self.simple_reward()

    plotter = MazePlotter(self)  # renders the environment

    self.mdp = Mdp(self.nb_states, self.action_space, start_distribution, transition_matrix,
                   reward_matrix, plotter, gamma=gamma, terminal_states=terminal_states,
                   timeout=timeout)
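# A hedged usage sketch for the constructor above. The class name Maze and the chosen
# wall/terminal indices are assumptions for illustration, not from the original code:
# build a small 4x3 maze and inspect how cells are numbered (-1 marks walls, the other
# cells get consecutive state ids in the order of the double loop over i, then j).
maze = Maze(width=4, height=3, walls=[5], terminal_states=[10], start_states=[0])
print(maze.cells)      # cell index 5 holds -1; the remaining 11 cells hold states 0..10
print(maze.nb_states)  # 11 reachable states, plus the extra "well" state in the transition matrix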
    'hold0',  # index 6
    'hold1',  # index 7
    'hold2']  # index 8

trading_rule = trading_rules[5]

# 222
# type of reinforcement learning method
transaction_cost = 0
rl1 = rlm.Rl_linear(transaction_cost, epsilon, r_t, N, M, method_type, alpha_linear,
                    gamma, random_init)
mean = 0.00
sigma = 0.01
rl2 = rlm.Rl_full_matrix(transaction_cost, epsilon, r_t, N, M, method_type, alpha_grid,
                         gamma, random_init, mean, sigma)

no_trade_reward = 0
mdp = Mdp(rl2, r_t, L, transaction_cost, no_trade_reward, trading_rule)  # computes return

actions = []
equity_lines = []
start = max(N, L, M)
end = T_max - L

for iter in range(iterations_nb):
    state = mdp.reset(start)
    for t in range(start, end):
        # exploration exploitation
        if np.random.rand() < epsilon:
            action_t = np.random.randint(-1, 2)
        else:
            #action_t = mdp.rl_method.next_action()
from mdp import Mdp
from parser_mdp import Parser
import glob

files = glob.glob('DeterministicGoalState/*') + glob.glob('RandomGoalState/*')

for file in files[11:12]:
    navigationFile = open(file)
    navigationFileReaded = navigationFile.read()
    navigationFileParsed = Parser(navigationFileReaded)
    states = navigationFileParsed.get_states()

    policy_lao, time_lao = Mdp(states).lao_star()
    print("LAO, " + file.split('\\')[0] + ', ' + file.split('\\')[1] + ", " + str(round(time_lao, 2)))

    policy_iteration, time_iteration = Mdp(states).value_iteration()
    print("ITER, " + file.split('\\')[0] + ', ' + file.split('\\')[1] + ", " + str(round(time_iteration, 2)))
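# The split('\\') above assumes Windows path separators in the glob results. A hedged,
# cross-platform alternative (a sketch, not part of the original script) uses os.path
# to recover the folder and file name:
import os

def label_for(path):
    folder, name = os.path.split(path)
    return folder + ', ' + name

# print("LAO, " + label_for(file) + ", " + str(round(time_lao, 2)))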