def get_best_action(world):
    # Next location: the top filtered neighbour, or the depot if battery inventory is low
    next_location: classes.Location = (
        decision.neighbour_filtering.filtering_neighbours(
            world.state,
            number_of_neighbours=1,
        )[0]
        if world.state.vehicle.battery_inventory > BATTERY_INVENTORY * 0.1
        else world.state.depots[0]
    )

    if world.state.is_at_depot():
        swappable_scooters_ids = []
        number_of_scooters_to_swap = 0
    else:
        # Find all scooters that can be swapped here
        swappable_scooters_ids = [
            scooter.id
            for scooter in world.state.current_location.get_swappable_scooters()
        ]

        # Calculate how many scooters can be swapped
        number_of_scooters_to_swap = world.state.get_max_number_of_swaps(
            world.state.current_location
        )

    # Return an action with no re-balancing, only scooter swapping
    return classes.Action(
        battery_swaps=swappable_scooters_ids[:number_of_scooters_to_swap],
        pick_ups=[],
        delivery_scooters=[],
        next_location=next_location.id,
    )
def test_next_state_from_action(self):
    value_function = decision.value_functions.LinearValueFunction(
        *self.value_function_args
    )
    value_function.setup(self.world.state)
    # Record current state
    vehicle = self.world.state.vehicles[0]
    # Scooters in the current cluster
    scooters = self.world.state.clusters[
        vehicle.current_location.id
    ].get_swappable_scooters()
    # Pick a random scooter from a different cluster to use as a delivery
    deliver_scooter = random.choice(
        random.choice(
            [
                cluster
                for cluster in self.world.state.clusters
                if cluster.id != vehicle.current_location.id
            ]
        ).scooters
    )
    vehicle.pick_up(deliver_scooter)
    # Action that does a bit of everything
    action = classes.Action(
        [scooter.id for scooter in scooters[:3]],
        [scooter.id for scooter in scooters[3:10]],
        [deliver_scooter.id],
        random.choice(
            [
                cluster.id
                for cluster in self.world.state.clusters
                if cluster.id != vehicle.current_location.id
            ]
        ),
    )
    function_next_state_features = value_function.convert_next_state_features(
        self.world.state, vehicle, action
    )
    self.world.state.do_action(action, vehicle, self.world.time)
    next_state_features = value_function.convert_state_to_features(
        self.world.state, vehicle
    )
    self.assertEqual(len(function_next_state_features), len(next_state_features))
    for i in range(len(function_next_state_features)):
        self.assertAlmostEqual(
            function_next_state_features[i],
            next_state_features[i],
            msg=f"not equal at {i}",
        )
def get_best_action(self, world, vehicle):
    if vehicle.is_at_depot():
        swappable_scooters_ids = []
        number_of_scooters_to_swap = 0
    else:
        # Find all scooters that can be swapped here
        swappable_scooters_ids = [
            scooter.id
            for scooter in vehicle.current_location.get_swappable_scooters(
                battery_limit=70
            )
        ]
        # Calculate how many scooters can be swapped
        number_of_scooters_to_swap = vehicle.get_max_number_of_swaps()

    if (
        vehicle.battery_inventory - number_of_scooters_to_swap
        < vehicle.battery_inventory_capacity * 0.1
    ) and not vehicle.is_at_depot():
        next_location = world.state.depots[0]
    else:
        # Rank non-tabu clusters by the fraction of their scooters needing a swap
        next_location = sorted(
            [
                cluster
                for cluster in world.state.clusters
                if cluster.id != vehicle.current_location.id
                and cluster.id not in world.tabu_list
            ],
            key=lambda cluster: (
                len(cluster.scooters) - cluster.get_current_state()
            )
            / (len(cluster.scooters) + 1),
            reverse=True,
        )[0]

    # Return an action with no re-balancing, only scooter swapping
    return classes.Action(
        battery_swaps=swappable_scooters_ids[:number_of_scooters_to_swap],
        pick_ups=[],
        delivery_scooters=[],
        next_location=next_location.id,
    )
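# --- Illustrative sketch (added for clarity, not part of the policy above) ---
# The sorted() call in get_best_action ranks clusters by the approximate
# fraction of their scooters that need a battery swap. The stand-in class
# below is hypothetical: it only assumes `scooters` (all scooters in the
# cluster) and `get_current_state()` (here taken to be the number of scooters
# that do NOT need a swap); the +1 in the denominator guards against empty
# clusters.
from dataclasses import dataclass


@dataclass
class FakeCluster:
    id: int
    scooters: list
    charged: int  # hypothetical stand-in for get_current_state()

    def get_current_state(self):
        return self.charged


_clusters = [
    FakeCluster(0, list(range(10)), 9),  # ratio 1/11: almost all charged
    FakeCluster(1, list(range(8)), 2),   # ratio 6/9: most need a swap
    FakeCluster(2, [], 0),               # empty cluster, ratio 0
]
_ranked = sorted(
    _clusters,
    key=lambda cluster: (len(cluster.scooters) - cluster.get_current_state())
    / (len(cluster.scooters) + 1),
    reverse=True,
)
assert [cluster.id for cluster in _ranked] == [1, 0, 2]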
#root.mainline['11'] = c.Weapon()
#root.mainline['12'] = c.Gear()
#root.mainline['13'] = c.Food()
#root.mainline['14'] = c.Magic()
#root.mainline['15'] = c.Spell()
#root.mainline['16'] = c.Skill()
#root.mainline['17'] = c.Quest()
#root.mainline['18'] = c.Encounter()
#root.mainline['19'] = c.Location()
#root.mainline['20'] = c.Building()
#root.mainline['21'] = c.Lodging()

root.structure['1'] = c.Root()  # structural model
root.structure['2'] = c.Actor()
root.structure['3'] = c.Item()
root.structure['4'] = c.Action()
root.structure['5'] = c.Place()
root.structure['6'] = c.Player()
root.structure['7'] = c.NonPlayer()
root.structure['8'] = c.Monster()
root.structure['9'] = c.Animal()
root.structure['10'] = c.Armour()
root.structure['11'] = c.Weapon()
root.structure['12'] = c.Gear()
root.structure['13'] = c.Food()
root.structure['14'] = c.Magic()
root.structure['15'] = c.Spell()
root.structure['16'] = c.Skill()
root.structure['17'] = c.Quest()
root.structure['18'] = c.Encounter()
root.structure['19'] = c.Location()
def ann_learning(self, value_function):
    value_function.setup(self.world.state)
    self.world.LOST_TRIP_REWARD = -1

    # Creating a list of states with associated negative reward
    simulation_state = copy.deepcopy(self.world.state)
    vehicle = simulation_state.vehicles[0]
    system_simulated_states = []
    i = 0
    # Simulating to provoke lost demand
    while len(system_simulated_states) < 10:
        _, _, lost_demand = system_simulation.scripts.system_simulate(
            simulation_state
        )
        # Recording state and lost reward if there was lost demand after simulation
        if len(lost_demand) > 0:
            system_simulated_states.append(
                (
                    value_function.get_state_features(
                        simulation_state,
                        vehicle,
                        i * globals.ITERATION_LENGTH_MINUTES,
                    ),
                    sum(demand for demand, _ in lost_demand)
                    * self.world.LOST_TRIP_REWARD,
                )
            )
        i += 1

    # Simulating actions that yield positive reward
    # (swap batteries in clusters with fewer available scooters than the ideal state)
    unsimulated_world = copy.deepcopy(self.world)
    accumulated_action_time = 0
    unsimulated_states = []
    # Recording clusters with fewer available scooters than the ideal state
    deficient_clusters = [
        cluster
        for cluster in unsimulated_world.state.clusters
        if len(cluster.get_available_scooters()) < cluster.ideal_state
    ]
    counter = 0
    vehicle = unsimulated_world.state.vehicles[0]
    # Safety break if the internal break doesn't apply; stop one short of the
    # end of the list since the action below targets the next cluster
    while counter < len(deficient_clusters) - 1 and len(unsimulated_states) < 10:
        # Swapping batteries in the n-th cluster of the deficient cluster list
        cluster = deficient_clusters[counter]
        vehicle.battery_inventory = vehicle.battery_inventory_capacity
        vehicle.current_location = cluster
        # Creating an action to swap all batteries and recording the state and reward
        action = classes.Action(
            [scooter.id for scooter in cluster.get_swappable_scooters()][
                : vehicle.battery_inventory
            ],
            [],
            [],
            deficient_clusters[counter + 1].id,
        )
        reward = action.get_reward(
            vehicle,
            0,
            self.world.DEPOT_REWARD,
            self.world.VEHICLE_INVENTORY_STEP_SIZE,
            self.world.PICK_UP_REWARD,
        )
        unsimulated_states.append(
            (
                value_function.get_state_features(
                    unsimulated_world.state, vehicle, accumulated_action_time
                ),
                reward,
            )
        )
        # Calculating action distance and action time so they can be used when
        # getting state features (unnecessary, but a time is required when
        # creating state features)
        action_distance = unsimulated_world.state.get_distance(
            vehicle.current_location.id, action.next_location
        )
        accumulated_action_time += unsimulated_world.state.do_action(
            action, vehicle, accumulated_action_time
        ) + action.get_action_time(action_distance)
        counter += 1

    # Training twice on the positively and negatively rewarded states
    for _ in range(2):
        for i in range(len(system_simulated_states) - 1):
            state_features, reward = system_simulated_states[i]
            next_state_features = system_simulated_states[i + 1][0]
            update_value_function(
                value_function, state_features, next_state_features, reward
            )
        for i in range(len(unsimulated_states) - 1):
            state_features, reward = unsimulated_states[i]
            next_state_features = unsimulated_states[i + 1][0]
            update_value_function(
                value_function, state_features, next_state_features, reward
            )

    # Check that the ANN predicts a higher value for the positively rewarded
    # state than for the negatively rewarded one
    self.assertGreater(
        value_function.estimate_value_from_state_features(unsimulated_states[0][0]),
        value_function.estimate_value_from_state_features(
            system_simulated_states[0][0]
        ),
    )
def get_best_action(self, world, vehicle):
    # Find all possible actions
    actions = world.state.get_possible_actions(
        vehicle,
        divide=self.get_possible_actions_divide,
        exclude=world.tabu_list,
        time=world.time,
        number_of_neighbours=self.number_of_neighbors,
    )
    state = world.state
    cache = EpsilonGreedyValueFunctionPolicy.get_cache(state)
    # Get state representation of current state
    state_features = self.value_function.get_state_features(
        world.state, vehicle, cache
    )

    # Defaults so the training call below is defined even in the explore branch
    reward = 0
    next_state_features = []

    # Epsilon greedy: choose an action based on the value function
    if self.epsilon > random.rand():
        best_action = random.choice(actions)
    else:
        # Create a list containing all actions and their evaluations:
        # (action, reward + value_function_value, next_state_features, reward)
        action_info = [
            (
                classes.Action([], [], [], random.choice(world.state.locations).id),
                -1000,
                [],
                0,
            )  # Fallback in case no actions are generated
        ]
        for action in actions:
            # Look one action ahead
            forward_state: classes.State = copy.deepcopy(state)
            forward_vehicle: classes.Vehicle = forward_state.get_vehicle_by_id(
                vehicle.id
            )
            # Perform the action
            forward_state.do_action(action, forward_vehicle, world.time)
            # Simulate the system to generate potential lost trips
            _, _, lost_demands = system_simulation.scripts.system_simulate(
                forward_state
            )
            # Record lost trip rewards
            reward = (
                sum(map(lambda lost_trips: lost_trips[0], lost_demands))
                if len(lost_demands) > 0
                else 0
            )
            # Find all actions after taking the action moving the state to s_{t+1}
            next_action_actions = forward_state.get_possible_actions(
                forward_vehicle,
                divide=self.get_possible_actions_divide,
                exclude=world.tabu_list + [action.next_location],
                time=world.time
                + action.get_action_time(
                    state.get_distance(
                        vehicle.current_location.id,
                        forward_vehicle.current_location.id,
                    )
                ),
                number_of_neighbours=self.number_of_neighbors,
            )
            cache = EpsilonGreedyValueFunctionPolicy.get_cache(forward_state)
            forward_action_info = []
            for next_state_action in next_action_actions:
                # Generate the features of the state reached by this next action
                next_state_features = self.value_function.get_next_state_features(
                    forward_state,
                    forward_vehicle,
                    next_state_action,
                    cache,
                )
                # Calculate the expected future reward of being in this new state
                next_state_value = (
                    self.value_function.estimate_value_from_state_features(
                        next_state_features
                    )
                )
                # Add the transition to a list for later evaluation
                forward_action_info.append(
                    (next_state_action, next_state_value, next_state_features)
                )
            # Find the greedy best next action
            best_next_action, next_state_value, next_state_features = max(
                forward_action_info, key=lambda pair: pair[1]
            )
            # Add this transition for later evaluation
            action_info.append(
                (
                    action,
                    next_state_value + reward * world.LOST_TRIP_REWARD,
                    next_state_features,
                    reward,
                )
            )
        # Choose the action with the highest value and reward; keep the reward
        # of the chosen action so training uses the matching transition
        best_action, next_state_value, next_state_features, reward = max(
            action_info, key=lambda pair: pair[1]
        )

    if not world.disable_training:
        if self.value_function.use_replay_buffer():
            self.value_function.train(world.REPLAY_BUFFER_SIZE)
        else:
            self.value_function.train(
                (
                    state_features,
                    reward * world.LOST_TRIP_REWARD,
                    next_state_features,
                )
            )
    return best_action, state_features
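# --- Illustrative sketch (added for clarity, not part of the policy above) ---
# The `self.epsilon > random.rand()` test above implies that `random` refers
# to numpy's random module (the standard-library `random` has no `rand()`).
# Under that assumption, the epsilon-greedy pattern in isolation:
import numpy.random as random

epsilon = 0.1
actions = ["swap", "pick_up", "deliver"]      # illustrative action labels
values = {"swap": 3.0, "pick_up": 1.0, "deliver": 2.0}
if epsilon > random.rand():
    # Explore: a uniformly random action
    chosen = random.choice(actions)
else:
    # Exploit: the action with the highest estimated value
    chosen = max(actions, key=values.get)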
def get_best_action(self, world, vehicle):
    vehicle_has_scooter_inventory = len(vehicle.scooter_inventory) > 0
    if vehicle.is_at_depot():
        scooters_to_deliver = []
        scooters_to_pickup = []
        number_of_scooters_to_pick_up = 0
        number_of_scooters_to_swap = 0
        scooters_to_swap = []
    else:
        # If the vehicle has scooter inventory, deliver all scooters and swap
        # all swappable scooters
        if vehicle_has_scooter_inventory:
            # Deliver all scooters in the scooter inventory, and don't pick up
            # any new scooters
            scooters_to_deliver = [
                scooter.id for scooter in vehicle.scooter_inventory
            ]
            scooters_to_pickup = []
            number_of_scooters_to_pick_up = 0
            # Swap as many scooters as possible, as this cluster most likely needs it
            swappable_scooters = vehicle.current_location.get_swappable_scooters()
            number_of_scooters_to_swap = min(
                vehicle.battery_inventory, len(swappable_scooters)
            )
            scooters_to_swap = [scooter.id for scooter in swappable_scooters][
                :number_of_scooters_to_swap
            ]
        else:
            # Pick up as many scooters as possible: min(remaining scooter
            # capacity, battery inventory, deviation from ideal state),
            # clamped at zero
            number_of_scooters_to_pick_up = max(
                min(
                    vehicle.scooter_inventory_capacity
                    - len(vehicle.scooter_inventory),
                    vehicle.battery_inventory,
                    len(vehicle.current_location.scooters)
                    - vehicle.current_location.ideal_state,
                ),
                0,
            )
            scooters_to_pickup = [
                scooter.id for scooter in vehicle.current_location.scooters
            ][:number_of_scooters_to_pick_up]
            # Do not swap any scooters in a cluster with a lot of scooters
            scooters_to_swap = []
            number_of_scooters_to_swap = 0
            # There are no scooters to deliver due to empty inventory
            scooters_to_deliver = []

    def get_next_location_id(is_finding_positive_deviation):
        return sorted(
            [
                cluster
                for cluster in world.state.clusters
                if cluster.id != vehicle.current_location.id
                and cluster.id not in world.tabu_list
            ],
            key=lambda cluster: len(cluster.get_available_scooters())
            - cluster.ideal_state,
            reverse=is_finding_positive_deviation,
        )[0].id

    # If the vehicle has under 10% battery inventory left, go to the depot
    if (
        vehicle.battery_inventory
        - number_of_scooters_to_swap
        - number_of_scooters_to_pick_up
        < vehicle.battery_inventory_capacity * 0.1
    ) and not vehicle.is_at_depot():
        next_location_id = world.state.depots[0].id
    else:
        """
        If the vehicle has scooter inventory upon arrival, go to a new
        positive-deviation cluster to pick up new scooters. If there is no
        scooter inventory, go to a cluster where the scooters picked up here
        can be dropped off, i.e. a negative-deviation cluster. If, however,
        the vehicle is at the depot, do the opposite, as the depot does not
        change the scooter inventory.
        """
        visit_positive_deviation_cluster_next = (
            vehicle_has_scooter_inventory
            if not vehicle.is_at_depot()
            else not vehicle_has_scooter_inventory
        )
        next_location_id = get_next_location_id(
            visit_positive_deviation_cluster_next
        )

    return classes.Action(
        scooters_to_swap,
        scooters_to_pickup,
        scooters_to_deliver,
        next_location_id,
    )
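# --- Illustrative sketch (added for clarity, not part of the policy above) ---
# A worked example of the pick-up clamp in get_best_action: the number of
# scooters to pick up is bounded by the remaining inventory capacity, the
# battery inventory, and the cluster's surplus over its ideal state, and is
# clamped at zero so a deficient cluster never yields a negative pick-up.
# All numbers are illustrative.
scooter_inventory_capacity = 5
scooter_inventory_size = 2        # room for 3 more scooters
battery_inventory = 10
scooters_in_cluster = 12
ideal_state = 8                   # surplus of 4 scooters

number_to_pick_up = max(
    min(
        scooter_inventory_capacity - scooter_inventory_size,  # 3
        battery_inventory,                                    # 10
        scooters_in_cluster - ideal_state,                    # 4
    ),
    0,
)
assert number_to_pick_up == 3  # capacity is the binding constraint here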
def get_best_action(self, world, vehicle):
    # Do-nothing baseline: no swaps, pick-ups or deliveries; go to location 0
    return classes.Action([], [], [], 0)