def showDomain(self, a=0):
    s = self.state
    # Draw the environment
    if self.domain_fig is None:
        self.move_fig = plt.subplot(111)
        s = s.reshape((self.BOARD_SIZE, self.BOARD_SIZE))
        self.domain_fig = plt.imshow(
            s, cmap='FlipBoard', interpolation='nearest', vmin=0, vmax=1)
        plt.xticks(np.arange(self.BOARD_SIZE), fontsize=FONTSIZE)
        plt.yticks(np.arange(self.BOARD_SIZE), fontsize=FONTSIZE)
        # plt.tight_layout()
        a_row, a_col = id2vec(a, [self.BOARD_SIZE, self.BOARD_SIZE])
        self.move_fig = self.move_fig.plot(
            a_col, a_row, 'kx', markersize=30.0)
        plt.show()
    a_row, a_col = id2vec(a, [self.BOARD_SIZE, self.BOARD_SIZE])
    self.move_fig.pop(0).remove()
    # Instead of 'x' you can use 'D' or 'o' as the marker
    self.move_fig = plt.plot(a_col, a_row, 'kx', markersize=30.0)
    s = s.reshape((self.BOARD_SIZE, self.BOARD_SIZE))
    self.domain_fig.set_data(s)
    plt.draw()
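# showDomain above follows matplotlib's draw-once / update-in-place pattern:
# artists are created on the first call, and later calls only replace their
# data. A minimal standalone sketch of that pattern (plain matplotlib,
# nothing rlpy-specific; the 4x4 size is illustrative):
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots()
img = ax.imshow(np.zeros((4, 4)), vmin=0, vmax=1, interpolation='nearest')
marker, = ax.plot(1, 2, 'kx', markersize=30.0)  # cross at col=1, row=2

# on each later frame: update the data instead of recreating the artists
img.set_data(np.eye(4))
marker.set_data([3], [0])
plt.draw()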
def step(self, a):
    x, y, speed, heading = self.state
    # Map a number in [0, 8] to a pair. The first element is the
    # acceleration direction; the second one is the indicator for the wheel
    acc, turn = id2vec(a, [3, 3])
    acc -= 1   # Mapping acc to [-1, 0, 1]
    turn -= 1  # Mapping turn to [-1, 0, 1]

    # Calculate the next state
    nx = x + speed * np.cos(heading) * self.delta_t
    ny = y + speed * np.sin(heading) * self.delta_t
    nspeed = speed + acc * self.ACCELERATION * self.delta_t
    nheading = heading + speed / self.CAR_LENGTH * \
        np.tan(turn * self.TURN_ANGLE) * self.delta_t

    # Bound values
    nx = bound(nx, self.XMIN, self.XMAX)
    ny = bound(ny, self.YMIN, self.YMAX)
    nspeed = bound(nspeed, self.SPEEDMIN, self.SPEEDMAX)
    nheading = wrap(nheading, self.HEADINGMIN, self.HEADINGMAX)

    # Collision with a wall => set the speed to zero
    if nx == self.XMIN or nx == self.XMAX or ny == self.YMIN or ny == self.YMAX:
        nspeed = 0

    ns = np.array([nx, ny, nspeed, nheading])
    self.state = ns.copy()
    terminal = self.isTerminal()
    r = self.GOAL_REWARD if terminal else self.STEP_REWARD
    return r, ns, terminal, self.possibleActions()
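# These snippets rely on a few helpers (id2vec, bound, wrap) whose
# definitions are not shown in this section. The sketches below are
# hypothetical reconstructions consistent with how the helpers are used
# here, not the library's own definitions; in particular, the digit order
# of id2vec (first dimension least significant) is an assumption.
import numpy as np

def id2vec(ind, limits):
    # Decode a flat index into one coordinate per dimension (mixed radix),
    # e.g. id2vec(a, [3, 3]) maps a in [0, 8] to an (acc, turn) pair.
    prods = np.cumprod(limits)
    vec = [0] * len(limits)
    for d in range(len(limits) - 1, 0, -1):
        vec[d] = ind // prods[d - 1]
        ind %= prods[d - 1]
    vec[0] = ind
    return vec

def bound(x, lo, hi):
    # Clip x into the closed interval [lo, hi].
    return max(lo, min(x, hi))

def wrap(x, lo, hi):
    # Wrap x periodically into [lo, hi), e.g. headings into [-pi, pi).
    return lo + (x - lo) % (hi - lo)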
def expectedStep(self, s, a):
    # Returns k possible outcomes:
    #  p: k-by-1   probability of each transition
    #  r: k-by-1   rewards
    # ns: k-by-|s| next states
    #  t: k-by-1   terminal values
    [A, B] = id2vec(a, [self.blocks, self.blocks])
    # Nominal move:
    ns1 = s.copy()
    ns1[A] = B  # A is on top of B now.
    terminal1 = self.isTerminal(ns1)
    r1 = self.GOAL_REWARD if terminal1 else self.STEP_REWARD
    if self.destination_is_table(A, B):
        p = np.array([1]).reshape((1, -1))
        r = np.array([r1]).reshape((1, -1))
        ns = np.array([ns1]).reshape((1, -1))
        t = np.array([terminal1]).reshape((1, -1))
        return p, r, ns, t
    else:
        # consider dropping the block
        ns2 = s.copy()
        ns2[A] = A  # Drop on table
        terminal2 = self.isTerminal(ns2)
        r2 = self.GOAL_REWARD if terminal2 else self.STEP_REWARD
        p = np.array([1 - self.noise, self.noise]).reshape((2, 1))
        r = np.array([r1, r2]).reshape((2, 1))
        ns = np.array([[ns1], [ns2]]).reshape((2, -1))
        t = np.array([terminal1, terminal2]).reshape((2, -1))
        return p, r, ns, t
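# A hypothetical consumer of expectedStep, to show how (p, r, ns, t) fit
# together: a one-step expected Bellman backup. `V` is an assumed callable
# mapping a state vector to a value estimate, and `discount` is
# illustrative; neither comes from the original code.
def expected_backup(domain, s, a, V, discount=0.95):
    p, r, ns, t = domain.expectedStep(s, a)
    q = 0.0
    for k in range(len(p)):
        # no future value from terminal outcomes
        future = 0.0 if t[k] else discount * V(ns[k])
        q += p[k, 0] * (r[k, 0] + future)
    return q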
def step(self, a):
    s = self.state
    # Move block A on top of block B
    [A, B] = id2vec(a, [self.blocks, self.blocks])
    if not self.validAction(s, A, B):
        print('State:%s, Invalid move from %d to %d' % (str(s), A, B))
        print(self.possibleActions())
        print(id2vec(self.possibleActions(), [self.blocks, self.blocks]))
    if self.random_state.random_sample() < self.noise:
        B = A  # Drop on table
    ns = s.copy()
    ns[A] = B  # A is on top of B now.
    self.state = ns.copy()
    terminal = self.isTerminal()
    r = self.GOAL_REWARD if terminal else self.STEP_REWARD
    return r, ns, terminal, self.possibleActions()
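# Sketch of the BlocksWorld state encoding implied above: state[i] holds
# the index of the block that block i rests on, and state[i] == i means
# block i sits directly on the table (hence the noisy drop is ns[A] = A).
import numpy as np
s = np.array([0, 0, 1])  # block 0 on the table, 1 on 0, 2 on 1: a tower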
def step(self, a): """ Move all intruders according to the :py:meth:`~rlpy.Domains.IntruderMonitoring.IntruderPolicy`, default uniform random action. Move all agents according to the selected action ``a``. Calculate the reward = Number of danger zones being violated by intruders while no agents are present (ie, intruder occupies a danger cell with no agent simultaneously occupying the cell). """ s = self.state # Move all agents based on the taken action agents = np.array(s[:self.NUMBER_OF_AGENTS * 2].reshape(-1, 2)) actions = id2vec(a, self.ACTION_LIMITS) actions = self.ACTIONS_PER_AGENT[actions] agents += actions # Generate actions for each intruder based on the function # IntruderPolicy() intruders = np.array(s[self.NUMBER_OF_AGENTS * 2:].reshape(-1, 2)) actions = [ self.IntruderPolicy(intruders[i]) for i in xrange(self.NUMBER_OF_INTRUDERS) ] actions = self.ACTIONS_PER_AGENT[actions] intruders += actions # Put all info in one big vector ns = np.hstack((agents.ravel(), intruders.ravel())) # Saturate states so that if actions forced agents to move out of the # grid world they bound back ns = bound_vec(ns, self.discrete_statespace_limits) # Find agents and intruders after saturation agents = ns[:self.NUMBER_OF_AGENTS * 2].reshape(-1, 2) intruders = ns[self.NUMBER_OF_AGENTS * 2:].reshape(-1, 2) # Reward Calculation map = np.zeros((self.ROWS, self.COLS), 'bool') map[intruders[:, 0], intruders[:, 1]] = True map[agents[:, 0], agents[:, 1]] = False intrusion_counter = np.count_nonzero( map[self.danger_zone_locations[:, 0], self.danger_zone_locations[:, 1]]) r = intrusion_counter * self.INTRUSION_PENALTY ns = bound_vec(ns, self.discrete_statespace_limits) # print s, id2vec(a,self.ACTION_LIMITS), ns self.state = ns.copy() return r, ns, False, self.possibleActions()
def printDomain(self, s, a):
    print('--------------')
    for i in range(0, self.NUMBER_OF_AGENTS):
        s_a = s[i * 2:i * 2 + 2]
        aa = id2vec(a, self.ACTION_LIMITS)
        print('Agent {} Location: {} Action {}'.format(i, s_a, aa))
    offset = 2 * self.NUMBER_OF_AGENTS
    for i in range(0, self.NUMBER_OF_INTRUDERS):
        s_i = s[offset + i * 2:offset + i * 2 + 2]
        print('Intruder', s_i)
    # step takes only the action and returns four values (see step above)
    r, ns, terminal, p_actions = self.step(a)
    print('Reward ', r)
def step(self, a): """ Move all intruders according to the :py:meth:`~rlpy.Domains.IntruderMonitoring.IntruderPolicy`, default uniform random action. Move all agents according to the selected action ``a``. Calculate the reward = Number of danger zones being violated by intruders while no agents are present (ie, intruder occupies a danger cell with no agent simultaneously occupying the cell). """ s = self.state # Move all agents based on the taken action agents = np.array(s[:self.NUMBER_OF_AGENTS * 2].reshape(-1, 2)) actions = id2vec(a, self.ACTION_LIMITS) actions = self.ACTIONS_PER_AGENT[actions] agents += actions # Generate actions for each intruder based on the function # IntruderPolicy() intruders = np.array(s[self.NUMBER_OF_AGENTS * 2:].reshape(-1, 2)) actions = [self.IntruderPolicy(intruders[i]) for i in xrange(self.NUMBER_OF_INTRUDERS)] actions = self.ACTIONS_PER_AGENT[actions] intruders += actions # Put all info in one big vector ns = np.hstack((agents.ravel(), intruders.ravel())) # Saturate states so that if actions forced agents to move out of the # grid world they bound back ns = bound_vec(ns, self.discrete_statespace_limits) # Find agents and intruders after saturation agents = ns[:self.NUMBER_OF_AGENTS * 2].reshape(-1, 2) intruders = ns[self.NUMBER_OF_AGENTS * 2:].reshape(-1, 2) # Reward Calculation map = np.zeros((self.ROWS, self.COLS), 'bool') map[intruders[:, 0], intruders[:, 1]] = True map[agents[:, 0], agents[:, 1]] = False intrusion_counter = np.count_nonzero( map[self.danger_zone_locations[:, 0], self.danger_zone_locations[:, 1]]) r = intrusion_counter * self.INTRUSION_PENALTY ns = bound_vec(ns, self.discrete_statespace_limits) # print s, id2vec(a,self.ACTION_LIMITS), ns self.state = ns.copy() return r, ns, False, self.possibleActions()
def step(self, a):
    ns = self.state.copy()
    ns = np.reshape(ns, (self.BOARD_SIZE, -1))
    a_row, a_col = id2vec(a, [self.BOARD_SIZE, self.BOARD_SIZE])
    # Flip the chosen row and column; the intersection cell is toggled by
    # both flips (back to its original value), so flip it once more to
    # leave the whole cross toggled exactly once
    ns[a_row, :] = np.logical_not(ns[a_row, :])
    ns[:, a_col] = np.logical_not(ns[:, a_col])
    ns[a_row, a_col] = not ns[a_row, a_col]
    ns = ns.flatten()
    self.state = ns.copy()  # update the state before testing terminality
    if self.isTerminal():
        terminal = True
        r = 0
    else:
        terminal = False
        r = self.STEP_REWARD
    return r, ns, terminal, self.possibleActions()
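# Toy check of the flip dynamics above: one action toggles exactly the
# chosen row and column (the intersection is toggled three times, so it
# ends up flipped once like the rest of the cross), and applying the same
# action twice restores the board.
import numpy as np

def flip(board, row, col):
    b = board.copy()
    b[row, :] = np.logical_not(b[row, :])
    b[:, col] = np.logical_not(b[:, col])
    b[row, col] = np.logical_not(b[row, col])
    return b

b0 = np.zeros((3, 3), bool)
assert np.array_equal(flip(flip(b0, 1, 2), 1, 2), b0)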
def step(self, a):
    if self.random_state.random_sample() < self.noise:
        # Random move: the wheels slip, so log the slip location and
        # replace the chosen action with a random legal one
        self.slips.append(self.state[:2])
        a = self.random_state.choice(self.possibleActions())
    # Map a number in [0, 8] to a pair. The first element is the
    # acceleration direction; the second one is the indicator for the wheel
    acc, turn = id2vec(a, [3, 3])
    acc -= 1   # Mapping acc to [-1, 0, 1]
    turn -= 1  # Mapping turn to [-1, 0, 1]
    # Calculate the next state
    ns = self._action_dynamics(self.state, acc, turn)
    self.state = ns.copy()
    terminal = self.isTerminal()
    r = self._reward(ns, terminal)
    return r, ns, terminal, self.possibleActions()
def stateID2state(self, s_id): """ Returns the state vector correponding to a state_id. If dimensions are continuous it returns the state representing the middle of the bin (each dimension is discretized according to ``representation.discretization``. :param s_id: The id of the state, often calculated using the ``state2bin`` function :return: The state *s* corresponding to the integer *s_id*. """ # Find the bin number on each dimension s = np.array(id2vec(s_id, self.bins_per_dim)) # Find the value corresponding to each bin number for d in xrange(self.domain.state_space_dims): s[d] = bin2state(s[d], self.bins_per_dim[d], self.domain.statespace_limits[d,:]) if len(self.domain.continuous_dims) == 0: s = s.astype(int) return s
def stateID2state(self, s_id): """ Returns the state vector correponding to a state_id. If dimensions are continuous it returns the state representing the middle of the bin (each dimension is discretized according to ``representation.discretization``. :param s_id: The id of the state, often calculated using the ``state2bin`` function :return: The state *s* corresponding to the integer *s_id*. """ # Find the bin number on each dimension s = np.array(id2vec(s_id, self.bins_per_dim)) # Find the value corresponding to each bin number for d in range(self.domain.state_space_dims): s[d] = bin2state(s[d], self.bins_per_dim[d], self.domain.statespace_limits[d, :]) if len(self.domain.continuous_dims) == 0: s = s.astype(int) return s
def step(self, a):
    # Note below that we pass the structure by reference to save time;
    # i.e., components of sStruct refer directly to s
    ns = self.state.copy()
    sStruct = self.state2Struct(self.state)
    nsStruct = self.state2Struct(ns)
    # Subtract 1 below to give -1, 0, 1, so actions sum easily;
    # id2vec returns a list of the form [0, 1, 0, 2] with one entry per UAV
    actionVector = np.array(id2vec(a, self.LIMITS))
    nsStruct.locations += (actionVector - 1)

    # TODO - incorporate cost graph as in matlab.
    fuelBurnedBool = [(actionVector[i] == UAVAction.LOITER and
                       (nsStruct.locations[i] == UAVLocation.REFUEL or
                        nsStruct.locations[i] == UAVLocation.BASE))
                      for i in range(self.NUM_UAV)]
    fuelBurnedBool = np.array(fuelBurnedBool) == 0.
    nsStruct.fuel = np.array([sStruct.fuel[i] - self.NOM_FUEL_BURN * fuelBurnedBool[i]
                              for i in range(self.NUM_UAV)])
    self.fuelUnitsBurned = np.sum(fuelBurnedBool)
    distanceTraveled = np.sum(
        np.logical_and(nsStruct.locations, sStruct.locations))

    # Actuator failure transition
    randomFails = np.array([self.random_state.random_sample()
                            for dummy in range(self.NUM_UAV)])
    randomFails = randomFails > self.P_ACT_FAIL
    nsStruct.actuator = np.logical_and(sStruct.actuator, randomFails)

    # Sensor failure transition
    randomFails = np.array([self.random_state.random_sample()
                            for dummy in range(self.NUM_UAV)])
    randomFails = randomFails > self.P_SENSOR_FAIL
    nsStruct.sensor = np.logical_and(sStruct.sensor, randomFails)

    # Refuel those in the refuel node
    refuelIndices = np.nonzero(
        np.logical_and(sStruct.locations == UAVLocation.REFUEL,
                       nsStruct.locations == UAVLocation.REFUEL))
    nsStruct.fuel[refuelIndices] = self.FULL_FUEL

    # Fix sensors and motors in the base state
    baseIndices = np.nonzero(
        np.logical_and(sStruct.locations == UAVLocation.BASE,
                       nsStruct.locations == UAVLocation.BASE))
    nsStruct.actuator[baseIndices] = ActuatorState.RUNNING
    nsStruct.sensor[baseIndices] = SensorState.RUNNING

    # Test if we have communication
    self.isCommStatesCovered = any(sStruct.locations == UAVLocation.COMMS)
    surveillanceBool = (sStruct.locations == UAVLocation.SURVEIL)
    self.numHealthySurveil = sum(
        np.logical_and(surveillanceBool, sStruct.sensor))

    totalStepReward = 0
    ns = self.struct2State(nsStruct)
    self.state = ns.copy()

    ##### Compute reward #####
    if self.isCommStatesCovered:
        totalStepReward += self.SURVEIL_REWARD * \
            min(self.NUM_TARGET, self.numHealthySurveil)
    if self.isTerminal():
        totalStepReward += self.CRASH_REWARD
    totalStepReward += self.FUEL_BURN_REWARD_COEFF * self.fuelUnitsBurned + \
        self.MOVE_REWARD_COEFF * \
        distanceTraveled  # Presently the movement penalty is set to 0
    return totalStepReward, ns, self.isTerminal(), self.possibleActions()
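# Toy check of the failure transitions above, with hypothetical numbers:
# each component survives a step with probability 1 - P_FAIL, and a failed
# component stays failed (np.logical_and) until repaired at the base.
import numpy as np
rng = np.random.RandomState(0)
P_FAIL = 0.05
healthy = np.array([True, True, False])      # third component already failed
survives = rng.random_sample(3) > P_FAIL     # Bernoulli(1 - P_FAIL) draws
healthy = np.logical_and(healthy, survives)  # failed stays failed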