def load_policy(agent_id):
    facility_id = Utils.agentid_to_fid(agent_id)
    _facility = env.world.facilities[facility_id]
    if Utils.is_producer_agent(agent_id):
        # Producer agents always use the baseline producer policy.
        return ProducerBaselinePolicy(
            env.observation_space, env.action_space_producer,
            BaselinePolicy.get_config_from_env(env))
    elif isinstance(_facility, (SKUStoreUnit, SKUWarehouseUnit)):
        # Store and warehouse SKU units follow a base-stock consumer policy,
        # parameterized by the precomputed base-stock level for this facility.
        policy = ConsumerBaseStockPolicy(
            env.observation_space, env.action_space_consumer,
            BaselinePolicy.get_config_from_env(env))
        policy.base_stock = sku_base_stocks[facility_id]
        return policy
    else:
        # All remaining consumer agents fall back to the baseline consumer policy.
        return ConsumerBaselinePolicy(
            env.observation_space, env.action_space_consumer,
            BaselinePolicy.get_config_from_env(env))
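# Usage sketch (an assumption, not taken from the original source): load_policy
# is meant to be called once per agent id to assemble the per-agent policy table
# that eval() and visualization() below index by agent id.
def build_baseline_policies():
    return {agent_id: load_policy(agent_id) for agent_id in env.agent_ids()}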
def action_dictionary_to_control(self, action_dict, world):
    # Group per-agent actions by the facility they belong to.
    actions_by_facility = defaultdict(list)
    for agent_id, action in action_dict.items():
        f_id = Utils.agentid_to_fid(agent_id)
        actions_by_facility[f_id].append((agent_id, action))
    # Convert each facility's grouped actions into a single facility control.
    controls = {}
    for f_id, actions in actions_by_facility.items():
        controls[f_id] = self._actions_to_control(world.facilities[f_id], actions)
    return World.Control(facility_controls=controls)
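# Note: a facility typically exposes both a producer agent and a consumer agent;
# because Utils.agentid_to_fid maps both to the same facility id, their actions
# land in the same actions_by_facility bucket and are translated together by
# _actions_to_control into one control entry for that facility.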
def load_policy(agent_id):
    agent_echelon = env.world.agent_echelon[Utils.agentid_to_fid(agent_id)]
    if Utils.is_producer_agent(agent_id):
        policy_name = 'baseline_producer'
    elif agent_echelon == total_echelon - 1:
        # Retailer (store) echelon: trained PPO consumer policy.
        policy_name = 'ppo_store_consumer'
    elif agent_echelon >= total_echelon - echelon_to_train:
        # Warehouse echelons that are currently being trained with PPO.
        policy_name = 'ppo_warehouse_consumer'
    else:
        # Upstream echelons not yet being trained keep the baseline policy.
        policy_name = 'baseline_consumer'
    return ppo_trainer.get_policy(policy_name)
def echelon_policy_map_fn(echelon, agent_id):
    facility_id = Utils.agentid_to_fid(agent_id)
    if Utils.is_producer_agent(agent_id):
        return 'baseline_producer'
    agent_echelon = env.world.agent_echelon[facility_id]
    if agent_echelon == 0:
        # Supplier echelon.
        return 'baseline_consumer'
    elif agent_echelon == env.world.total_echelon - 1:
        # Retailer echelon.
        return 'ppo_store_consumer'
    elif agent_echelon >= echelon:
        # Warehouse echelon that is currently training or has already been trained.
        return 'ppo_warehouse_consumer'
    else:
        # Warehouse echelons that have not been trained yet.
        return 'baseline_consumer'
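# Wiring sketch (an assumption about the trainer setup, not taken from the
# original source): RLlib's policy_mapping_fn receives only the agent id, so the
# echelon threshold is bound in beforehand; the resulting single-argument
# callable can then be placed in the multi-agent trainer config.
from functools import partial


def make_policy_mapping_fn(echelon_being_trained):
    # echelon_being_trained is a hypothetical name for the echelon index that is
    # currently switched to PPO training.
    return partial(echelon_policy_map_fn, echelon_being_trained)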
def eval(self, iter, eval_on_trainingset=False):
    self.switch_mode(eval=True)
    print(f" == eval iteration {iter} == ")
    obss = self.env.reset(eval=True, eval_on_trainingset=eval_on_trainingset)
    _, infos = self.env.state_calculator.world_to_state(self.env.world)
    rnn_states = {}
    rewards_all = {}
    episode_reward_all = {}
    episode_reward = {}
    episode_steps = []
    episode_step = 0
    tracker = SimulationTracker(self.env.done_step, 1, self.env.agent_ids())
    for agent_id in obss.keys():
        rnn_states[agent_id] = self.policies[agent_id].get_initial_state()
        rewards_all[agent_id] = []
        episode_reward_all[agent_id] = []
        episode_reward[agent_id] = 0

    for i in range(100000):
        episode_step += 1
        actions = {}
        # Query every agent's policy deterministically (no exploration).
        for agent_id, obs in obss.items():
            policy = self.policies[agent_id]
            action, new_state, _ = policy.compute_single_action(
                obs, state=rnn_states[agent_id], info=infos[agent_id], explore=False)
            rnn_states[agent_id] = new_state  # carry recurrent state forward
            actions[agent_id] = action
        next_obss, rewards, dones, infos = self.env.step(actions)
        for agent_id, reward in rewards.items():
            rewards_all[agent_id].append(reward)
            episode_reward[agent_id] += reward
        # Track per-facility balances alongside the global balance and rewards.
        step_balances = {}
        for agent_id in rewards.keys():
            step_balances[agent_id] = self.env.world.facilities[
                Utils.agentid_to_fid(agent_id)].economy.step_balance.total()
        tracker.add_sample(0, episode_step - 1,
                           self.env.world.economy.global_balance().total(),
                           step_balances, rewards)
        if any(dones.values()):
            obss = self.env.reset(eval=True)
            episode_steps.append(episode_step)
            episode_step = 0
            for agent_id, reward in episode_reward.items():
                episode_reward_all[agent_id].append(reward)
                episode_reward[agent_id] = 0
            break
        obss = next_obss

    infos = {
        "rewards_all": rewards_all,
        "episode_reward_all": episode_reward_all,
        "epsilon": self.policies[self.policies_to_train[0]].epsilon,
        "all_step": self.step,
        "episode_step": sum(episode_steps) / len(episode_steps),
        "profit": tracker.get_retailer_profit(),
    }
    return infos
def visualization(env, policies, iteration, policy_mode, basestock=False):
    renderer = AsciiWorldRenderer()
    frame_seq = []
    evaluation_epoch_len = env.env_config['evaluation_len']
    starter_step = env.env_config['episod_duration'] + env.env_config['tail_timesteps']
    env.set_iteration(1, 1)
    print(f"Environment: Producer action space {env.action_space_producer}, "
          f"Consumer action space {env.action_space_consumer}, "
          f"Observation space {env.observation_space}", flush=True)
    obss = env.reset()
    if basestock:
        from scheduler.inventory_base_stock_policy import ConsumerBaseStockPolicy
        ConsumerBaseStockPolicy.facilities = env.world.facilities
    if Utils.get_demand_sampler() == 'ONLINE':
        env.set_retailer_step(starter_step)
    _, infos = env.state_calculator.world_to_state(env.world)
    rnn_states = {}
    rewards = {}
    for agent_id in obss.keys():
        rnn_states[agent_id] = policies[agent_id].get_initial_state()
        rewards[agent_id] = 0

    # Simulation loop
    tracker = SimulationTracker(evaluation_epoch_len, 1, env.agent_ids())
    print(f" === evaluation length {evaluation_epoch_len}, it will take about 1 min ...",
          flush=True)
    for epoch in range(evaluation_epoch_len):
        action_dict = {}
        for agent_id, obs in obss.items():
            policy = policies[agent_id]
            action, new_state, _ = policy.compute_single_action(
                obs, state=rnn_states[agent_id], info=infos[agent_id], explore=False)
            rnn_states[agent_id] = new_state  # carry recurrent state forward
            action_dict[agent_id] = action
        obss, rewards, dones, infos = env.step(action_dict)
        step_balances = {}
        for agent_id in rewards.keys():
            step_balances[agent_id] = env.world.facilities[
                Utils.agentid_to_fid(agent_id)].economy.step_balance.total()
        tracker.add_sample(0, epoch, env.world.economy.global_balance().total(),
                           step_balances, rewards)
        # Per-SKU statistics for the rendered plots.
        stock_status = env.get_stock_status()
        order_in_transit_status = env.get_order_in_transit_status()
        demand_status = env.get_demand_status()
        tracker.add_sku_status(0, epoch, stock_status, order_in_transit_status,
                               demand_status)
        frame = renderer.render(env.world)
        frame_seq.append(np.asarray(frame))
    print(tracker.get_retailer_profit())

    output_dir = f'output/{policy_mode}/iter_{iteration}'
    os.makedirs(output_dir, exist_ok=True)
    tracker.render(f'{output_dir}/plot.png')
    tracker.render_sku(policy_mode, iteration)
    print(" === evaluation end ", flush=True)