class HAgent:
    def __init__(self, device, model, item_scorer, hcp=4):
        self.device = device
        self.cmd_memory = flist()
        self.item_scorer = item_scorer
        # self.navigator = Navigator(navigation_model)
        self.utils = None
        self.hcp = hcp
        self.step_count = 0
        self.total_score = 0
        self.current_score = 0
        self.recipe = ''
        self.reading = False
        self.model = model
        self.description = 'nothing'
        self.description_updated = True
        self.inventory = 'nothing'
        self.inventory_updated = False
        self.info = None

        # added for the KG part (relies on a module-level `params` dict)
        self.state = StateNAction()
        self.kg = KGDQN(params, self.state.all_actions).cuda()
        self.params = params
        self.num_frames = params['num_frames']
        if params['scheduler_type'] == 'exponential':
            self.e_scheduler = ExponentialSchedule(self.num_frames,
                                                   params['e_decay'],
                                                   params['e_final'])
        elif params['scheduler_type'] == 'linear':
            self.e_scheduler = LinearSchedule(self.num_frames,
                                              params['e_final'])

    def step(self, observation, info: Dict[str, Any], detailed_commands=False):
        """
        :param observation: observation from the environment.
        :param info: info dictionary from the environment.
        :return: One or multiple low-level commands that correspond to a single
            high-level action, plus the model info needed for the A2C learning.
        """
        self.info = info
        self.reading = 'and start reading' in observation

        # Retrieve the inventory, description, recipe and location
        # (different approaches for different HCPs).
        self.inventory, self.description = self._get_inventory_and_description(
            observation, info)
        inventory = [
            self.remove_articles(inv.strip())
            for inv in self.inventory.strip().split('\n')
            if 'carrying' not in inv
        ]
        self.recipe = self._get_recipe(observation)
        location = Navigator.extract_location(self.description)

        if len(self.cmd_memory) == 0:
            self.state.step(self.description.strip(),
                            pruned=self.params['pruned'])
        else:
            self.state.step(self.description.strip(),
                            prev_action=self.cmd_memory[-1],
                            pruned=self.params['pruned'])

        total_frames = 0  # has to be updated properly later
        epsilon = self.e_scheduler.value(total_frames)
        state_embedding, possible_commands = self.kg.act(self.state, epsilon)

        # nav_commands = self.navigator.get_navigational_commands(self.description)
        items = None
        if self._know_recipe():
            # Invoke the neural model to determine from the recipe and
            # inventory which items we need to pick up and which actions need
            # to be performed on them to satisfy the recipe.
            items, utils = self.item_scorer(recipe=self.recipe,
                                            inventory=self.inventory)
            # update the needed utils
            self._update_util_locations(self.description, utils, location)

        # build the representation of the current game state (dictionary of strings)
        state_description = self.build_state_description(
            self.description, items, state_embedding, observation, inventory)

        # generate a list of possible commands for the current game state
        # possible_commands = self.get_commands(self.description, items, location, inventory, nav_commands)

        # ask the model for the next command
        score, prob, value, high_level_command, index = self.model(
            state_description, possible_commands)

        cmds = flist()
        # translate the chosen high-level command into a (set of) low-level commands
        cmds.append(
            self.command_to_action(command=high_level_command,
                                   items=items,
                                   inventory=inventory,
                                   description=self.description))

        # save the information necessary for the A2C update of the model
        learning_info = LearningInfo(score=score,
                                     prob=prob,
                                     value=value,
                                     action=high_level_command,
                                     index=index,
                                     possible_actions=possible_commands)

        self.reading = (high_level_command == 'examine cookbook')
        self.step_count += 1
        self.cmd_memory.append(high_level_command)

        if detailed_commands:
            hl2ll = {
                hl_cmd: self.command_to_action(command=hl_cmd,
                                               items=items,
                                               inventory=inventory,
                                               description=self.description)
                for hl_cmd in possible_commands
            }
            return cmds, learning_info, hl2ll
        return cmds, learning_info

    def change_last_cmd(self, cmd):
        self.cmd_memory[-1] = cmd

    def _get_inventory_and_description(self, observation, info):
        """
        Returns the inventory and description of the current game state.
        For HCP 0, we try to get the information from the observation. If it is
        not in there, we do not update, i.e. the agent only has access to an
        old version of the description/inventory.
        """
        if self.hcp > 0:
            # for hcp > 0 the inventory and description are in info
            description = info['description']
            inventory = info['inventory']
        else:
            # for hcp == 0 the information needs to be extracted (if possible)
            # from the observation
            description = self._description_from_observation(observation)
            inventory = self._inventory_from_observation(observation)
        return inventory, description

    def _description_from_observation(self, observation):
        # only treat the observation as a room description if it contains the
        # '-= ... =-' location header
        if '-=' in observation and '=-' in observation:
            description = '-= ' + observation.split('-= ')[1]
            self.description_updated = True
        else:
            description = self.description
            self.description_updated = False
        return description

    def _inventory_from_observation(self, observation):
        if 'You are carrying' in observation:
            inventory = observation
            self.inventory_updated = True
        else:
            inventory = self.inventory
            self.inventory_updated = False
        return inventory

    def _update_util_locations(self, description, utils, location):
        """
        If we see a needed util in a visited location (e.g. the BBQ in the
        backyard), we store it in self.utils.
        """
        if self.utils is None and utils is not None:
            self.utils = {u: None for u in utils}
        for util, loc in self.utils.items():
            if loc is not None:
                continue
            if util in description:
                self.utils[util] = location

    def update_score(self, new_total_score):
        self.current_score = new_total_score - self.total_score
        self.total_score = new_total_score

    def _get_recipe(self, observation, explicit_recipe=None):
        """
        Returns the recipe if possible. For HCP >= 4 you can provide
        info['extra.recipe'] as the explicit recipe. Otherwise the observation
        is stored as the recipe if the last command was 'examine recipe'
        (= self.reading).
        """
        recipe = ''
        if self.recipe == '':
            if explicit_recipe is not None:
                recipe = explicit_recipe
            else:
                if self.reading:
                    recipe = '\nRecipe {}\n'.format(
                        observation.split('\n\nRecipe ')[1].strip())
        else:
            recipe = self.recipe
        return recipe

    def _know_recipe(self):
        return self.recipe != ''

    def command_to_action(self, command, items, inventory, description):
        """
        Translates the high-level command into a (set of) low-level commands.
        """
        if command == 'drop unnecessary items':
            cmd = self.drop_unnecessary_items(items, inventory)
        # elif command == 'explore':
        #     cmd = self.navigator.explore(description)
        elif command == 'take required items from here':
            cmd = self.take_all_required_items(items, description)
        elif command == 'open stuff':
            cmd = ['open fridge']
            if self.hcp == 0:
                cmd += ['look']
        # elif 'go to' in command:
        #     cmd = self.navigator.go_to(place=command.split('go to ')[1])
        elif 'prepare meal' in command:
            cmd = [command]
            if self.hcp == 0:
                cmd += ['inventory']
        elif 'with' in command:
            cmd = [command]
            if self.hcp == 0:
                cmd += ['inventory']
        else:
            cmd = [command]
        if len(cmd) == 0:
            cmd = ['look']
        return cmd

    def get_commands(self, description, items, location, inventory,
                     nav_commands):
        """
        Builds a list of possible commands based on the current game state and
        the HCP of the agent.
        """
        if self.hcp == 5:
            raise NotImplementedError('HCP 5 not supported anymore')
        elif self.hcp == 4:
            pass
            # return self._get_commands_hcp4(description, items, location, inventory)
        elif self.hcp >= 1:
            pass
            # return self._get_commands_hcp3(description, items, location, inventory)
        else:
            return self._get_commands_hcp0(description, items, location,
                                           inventory, nav_commands)

    def _get_commands_hcp0(self, description, items, location, inventory,
                           nav_commands):
        cmds = self._get_commands_hcp3(description, items, location, inventory)
        # for hcp 0 we need to add the look and inventory commands
        if not self.description_updated:
            cmds += ['look']
        if not self.inventory_updated:
            cmds += ['inventory']
        cmds += nav_commands
        return cmds

    def _get_commands_hcp3(self, description, items, location, inventory):
        """
        HCP 3 has the same commands as HCP 4 as soon as it has found the
        cookbook.
        """
        if self._know_recipe():
            return self._get_commands_hcp4(description, items, location,
                                           inventory)
        cmds = []
        # cmds = ['explore']
        if 'cookbook' in description:
            cmds.append('examine cookbook')
        # open fridge command
        if 'fridge' in description:
            cmds.append('open stuff')
        # if location != 'Kitchen' and 'Kitchen' in self.navigator.graph.keys():
        #     cmds.append('go to Kitchen')
        return cmds

    def _get_commands_hcp4(self, description, items, location, inventory):
        def get_drop_cmds(items, inventory):
            cmds = []
            for inv_item in inventory:
                for item in list(items.item):
                    if item in inv_item:
                        cmds.append('drop {}'.format(item))
                        continue
            return cmds

        standard_cmds = [
            'drop unnecessary items',
            # 'explore',
            'take required items from here'
        ]

        # navigation commands
        # navigation_cmds = ['go to {}'.format(loc) for loc in self.navigator.graph.keys()
        #                    if loc in list(self.utils.values()) + ['Kitchen'] and loc != location]

        # drop commands: all items in the inventory (that are necessary for the
        # recipe) can be dropped explicitly
        drop_cmds = get_drop_cmds(items, inventory)

        # pickup commands: if a knife is needed for the recipe and it is in the
        # description of the current room -> add command
        pickup_util_cmds = [
            'take {}'.format(util) for util in self.utils.keys()
            if util in description and util == 'knife'
        ]

        # drop utils commands: add command to drop a carried util, e.g. a knife
        drop_util_cmds = [
            'drop {}'.format(util) for util in self.utils.keys()
            if util in inventory
        ]

        # Recipe step commands: add the commands required for the recipe that
        # were determined by the neural model
        recipe_step_cmds = [
            cmd for sublist in [
                item['recipe_steps'] for _, item in items.iterrows()
                if item['already_in_inventory']
            ] for cmd in sublist
        ]
        recipe_step_cmds = [
            cmd for cmd in recipe_step_cmds
            if cmd.split('with ')[1] in self.utils
            and self.utils[cmd.split('with ')[1]] == location
        ]

        # open fridge command
        if 'fridge' in description:
            recipe_step_cmds.append('open stuff')

        # Finishing commands: prepare meal and eat meal
        finishing_cmds = []
        if 'meal' in inventory:
            finishing_cmds.append('eat meal')
        elif len([item for sublist in list(items.recipe_steps)
                  for item in sublist]) == 0 and location.lower() == 'kitchen':
            finishing_cmds.append('prepare meal')

        return (standard_cmds + drop_cmds + pickup_util_cmds + drop_util_cmds +
                recipe_step_cmds + finishing_cmds)

    def take_all_required_items(self, items, description):
        """
        List of take commands for all the necessary ingredients (specified by
        the neural model) that are present in the current location.
        """
        return [
            'take {}'.format(item)
            for (item, already_in_inventory) in zip(
                items['item'], items['already_in_inventory'])
            if item in description and not already_in_inventory
        ]

    def drop_unnecessary_items(self, items, inventory):
        """
        List of drop commands for all the unnecessary ingredients currently
        carried (specified by the neural model).
        """
        cmds = []
        for carried_item in inventory:
            if not any([item in carried_item for item in list(items.item)]):
                cmds.append('drop {}'.format(carried_item))
        return cmds

    def remove_articles(self, item):
        return item.replace('an ', '').replace('a ', '').replace('the ', '').strip()

    ### Input features construction

    def build_state_description(self, description, items, state_embedding,
                                observation, inventory):
        """
        Builds the string representation of the current state of the game.
        The state has 8 'features' that are all arbitrarily long strings.
        Some features come directly from the agent's observation, e.g.
        'observation', 'description', 'location'. Others are constructed using
        the output of the neural item scorer model, e.g. 'missing_items',
        'required_utils'.
        """
        state_description = {
            'observation': observation.split('$$$$$$$')[-1].replace(
                '\n\n', ' ').replace('\n', ' ').strip(),
            'missing_items': self._get_missing_items(items),
            'unnecessary_items': self._get_unnecessary_items(items, inventory),
            # 'location': location,
            'description': self._get_description(description),
            'previous_cmds': self._get_previous_cmds(length=10),
            'required_utils': self._get_required_utils(items)
            # 'discovered_locations': self._get_discovered_locations(),
        }
        # lowercase everything except the special tokens and strip punctuation
        for key, descr in state_description.items():
            state_description[key] = ' '.join([
                word.lower() if word not in ['<SEP>', '<DIR>'] else word
                for word in descr.replace('.', '').replace(',', '').replace(
                    '?', '').replace('!', '').replace(':', '').replace(
                        '  ', ' ').strip().split()
            ])
        state_description['state_embedding'] = state_embedding
        return state_description

    # def _get_discovered_locations(self):
    #     # locations = list(self.navigator.graph.keys())
    #     locations = self.navigator.discovered_locations
    #     if len(locations) == 0:
    #         return 'nothing'
    #     return ' <SEP> '.join(locations)

    def _get_required_utils(self, items):
        if items is None:
            return 'not determined yet'
        utils = [
            '{} not found'.format(util) if location is None else
            '{} in {}'.format(util, location)
            for util, location in self.utils.items()
        ]
        if len(utils) == 0:
            return 'nothing'
        return ' <SEP> '.join(utils)

    def _get_previous_cmds(self, length):
        cmds = self.cmd_memory[::-1][:length]
        if len(cmds) == 0:
            return 'nothing'
        return ' <SEP> '.join(cmds)

    def _get_description(self, description):
        return description.replace('\n\n\n\n', ' ').replace('\n', ' ').strip()

    def _get_missing_items(self, items):
        if items is None:
            return 'not determined yet'
        descr = []
        for _, item in items.iterrows():
            if not item.already_in_inventory:
                descr.append(' <DIR> '.join([item['item']] + item.recipe_steps))
        if len(descr) == 0:
            return 'nothing'
        return ' <SEP> '.join(descr)

    def _get_unnecessary_items(self, items, inventory):
        if items is None:
            return 'not determined yet'
        unnecessary_items = []
        for carried_item in inventory:
            if not any([item in carried_item for item in list(items.item)]):
                unnecessary_items.append(carried_item)
        if len(unnecessary_items) == 0:
            return 'nothing'
        return ' <SEP> '.join(unnecessary_items)
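# --- Illustrative usage sketch (added; not from the original implementation) ---
# A minimal example of how an HAgent could be driven for one episode. The
# environment API used here (env.reset() / env.step() returning a game state,
# score and done flag) and the layout of the `info` dict are assumptions; only
# the agent-facing calls (step, update_score, total_score) follow the
# interface defined above.
def _run_episode_sketch(env, agent, max_steps=100):
    """Hypothetical driver loop for an HAgent; the env API is assumed."""
    game_state = env.reset()
    observation = game_state.feedback                    # assumed attribute
    info = {'description': game_state.description,       # assumed info layout
            'inventory': game_state.inventory}
    for _ in range(max_steps):
        cmds, learning_info = agent.step(observation, info)
        for cmd in cmds:
            game_state, score, done = env.step(cmd)      # assumed signature
        agent.update_score(score)
        observation = game_state.feedback
        info = {'description': game_state.description,
                'inventory': game_state.inventory}
        if done:
            break
    return agent.total_score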
class KGDQNTrainer(object):
    def __init__(self, game, params):
        self.num_episodes = params['num_episodes']
        self.state = StateNAction()
        self.update_freq = params['update_frequency']
        self.filename = 'kgdqn_' + '_'.join(
            [str(v) for k, v in params.items() if 'file' not in str(k)])
        logging.basicConfig(filename='logs/' + self.filename + '.log',
                            filemode='w')
        logging.warning("Parameters: %s", params)
        self.env = textworld.start(game)
        self.params = params

        if params['replay_buffer_type'] == 'priority':
            self.replay_buffer = GraphPriorityReplayBuffer(
                params['replay_buffer_size'])
        elif params['replay_buffer_type'] == 'standard':
            self.replay_buffer = GraphReplayBuffer(
                params['replay_buffer_size'])

        params['vocab_size'] = len(self.state.vocab_drqa)
        self.model = KGDQN(params, self.state.all_actions).to(device)
        if self.params['preload_weights']:
            self.model = torch.load(self.params['preload_file'])['model']
        # model = nn.DataParallel(model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=params['lr'])

        self.env.compute_intermediate_reward()
        self.env.activate_state_tracking()

        self.num_frames = params['num_frames']
        self.batch_size = params['batch_size']
        self.gamma = params['gamma']

        self.losses = []
        self.all_rewards = []
        self.completion_steps = []

        # priority fraction
        self.rho = params['rho']

        if params['scheduler_type'] == 'exponential':
            self.e_scheduler = ExponentialSchedule(self.num_frames,
                                                   params['e_decay'],
                                                   params['e_final'])
        elif params['scheduler_type'] == 'linear':
            self.e_scheduler = LinearSchedule(self.num_frames,
                                              params['e_final'])

    def plot(self, frame_idx, rewards, losses, completion_steps):
        fig = plt.figure(figsize=(20, 5))
        plt.subplot(131)
        plt.title('frame %s. avg reward: %s' %
                  (frame_idx, np.mean(rewards[-10:])))
        plt.plot(rewards)
        plt.subplot(132)
        plt.title('frame %s. avg steps: %s' %
                  (frame_idx, np.mean(completion_steps[-10:])))
        plt.plot(completion_steps)
        plt.subplot(133)
        plt.title('loss-kgdqn')
        plt.plot(losses)
        plt.figtext(0.5, 0.01, self.filename, wrap=True,
                    horizontalalignment='center', fontsize=12)
        fig.savefig('plots/' + self.filename + '_' + str(frame_idx) + '.png')
        # plt.show()

    def compute_td_loss(self):
        state, action, reward, next_state, done = self.replay_buffer.sample(
            self.batch_size, self.rho)

        reward = torch.FloatTensor(reward).to(device)
        done = torch.FloatTensor(1 * done).to(device)
        action_t = torch.LongTensor(action).to(device)

        q_value = self.model.forward_td_init(state, action_t)[0][0]

        with torch.no_grad():
            # Loop through all feasible actions for the forward pass
            actions = torch.LongTensor(
                [a.pruned_actions_rep for a in list(next_state)]).to(device)
            fwd_init, sts = self.model.forward_td_init(
                next_state, actions[:, 0, :])  # .unsqueeze_(0)
            next_q_values = fwd_init[0].unsqueeze_(0)
            for i in range(1, actions.size(1)):
                act = actions[:, i, :]  # .squeeze()
                sts = sts.new_tensor(sts.data)
                cat_q = self.model.forward_td(sts, next_state,
                                              act)[0].unsqueeze_(0)
                next_q_values = torch.cat((next_q_values, cat_q), dim=0)
            next_q_values = next_q_values.transpose(0, 1)

        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.data).pow(2).mean()
        # clipped_loss = loss.clamp(-1.0, 1.0)
        # loss = loss.clamp(-1.0, 1.0)
        # right_gradient = clipped_loss * -1.0
        # loss.backward(right_gradient.data.unsqueeze(1)[:, 0])
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

        return loss

    def train(self):
        total_frames = 0
        for e_idx in range(1, self.num_episodes + 1):
            print("Episode:", e_idx)
            logging.info("Episode:" + str(e_idx))
            self.env.enable_extra_info('description')
            state = self.env.reset()
            self.state.step(state.description, pruned=self.params['pruned'])
            self.model.train()
            # print(state)
            episode_reward = 0
            completion_steps = 0
            episode_done = False
            prev_action = None

            for frame_idx in range(1, self.num_frames + 1):
                epsilon = self.e_scheduler.value(total_frames)
                action, picked = self.model.act(self.state, epsilon)
                action_text = self.state.get_action_text(action)

                logging.info('-------')
                logging.info(self.state.visible_state)
                logging.info('picked:' + str(picked))
                logging.info(action_text)

                next_state, reward, done = self.env.step(action_text)

                # if next_state.intermediate_reward == 0:
                #     reward += -0.1
                # else:
                #     reward += next_state.intermediate_reward
                reward += next_state.intermediate_reward
                reward = max(-1.0, min(reward, 1.0))
                print(frame_idx, action_text, reward, epsilon)

                if reward != 0:
                    # print(action_text, reward)
                    print("!!", frame_idx, action_text, reward, picked, epsilon)
                    logging.warning('--------')
                    logging.warning(frame_idx)
                    logging.warning(self.state.visible_state)
                    logging.warning(action_text)
                    logging.warning(reward)

                episode_reward += reward
                completion_steps += 1
                total_frames += 1

                if done:
                    logging.warning("Done")
                    self.all_rewards.append(episode_reward)
                    self.completion_steps.append(completion_steps)
                    episode_reward = 0
                    completion_steps = 0
                    break
                elif frame_idx == self.num_frames:
                    self.all_rewards.append(episode_reward)
                    self.completion_steps.append(completion_steps)
                    episode_reward = 0
                    completion_steps = 0

                state = self.state
                self.state.step(next_state.description,
                                prev_action=prev_action,
                                pruned=self.params['pruned'])
                prev_action = action_text
                self.replay_buffer.push(state, action, reward, self.state, done)

                if len(self.replay_buffer) > self.batch_size:
                    if frame_idx % self.update_freq == 0:
                        loss = self.compute_td_loss()
                        self.losses.append(loss.item())

            self.plot(e_idx, self.all_rewards, self.losses,
                      self.completion_steps)

            # periodic checkpoint (assumes num_episodes >= 500, otherwise the
            # modulus below is zero)
            if e_idx % (int(self.num_episodes / 500)) == 0:
                logging.info("Episode:" + str(e_idx))
                # self.plot(frame_idx, self.all_rewards, self.losses, self.completion_steps)
                parameters = {
                    'model': self.model,
                    'replay_buffer': self.replay_buffer,
                    'action_dict': self.state.all_actions,
                    'vocab_drqa': self.state.vocab_drqa,
                    'vocab_kge': self.state.vocab_kge,
                    'params': self.params,
                    'stats': {
                        'losses': self.losses,
                        'rewards': self.all_rewards,
                        'completion_steps': self.completion_steps
                    }
                }
                torch.save(parameters,
                           'models/' + self.filename + '_' + str(e_idx) + '.pt')

        parameters = {
            'model': self.model,
            'replay_buffer': self.replay_buffer,
            'action_dict': self.state.all_actions,
            'vocab_drqa': self.state.vocab_drqa,
            'vocab_kge': self.state.vocab_kge,
            'params': self.params,
            'stats': {
                'losses': self.losses,
                'rewards': self.all_rewards,
                'completion_steps': self.completion_steps
            }
        }
        torch.save(parameters, 'models/' + self.filename + '_final.pt')
        self.env.close()
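# --- Illustrative configuration sketch (added; not from the original code) ---
# Example of how the trainer above might be configured and launched. The game
# path and all parameter values are placeholders; the keys are the ones
# KGDQNTrainer actually reads ('e_decay' is only needed when the exponential
# scheduler is selected).
if __name__ == '__main__':
    example_params = {
        'num_episodes': 500,
        'num_frames': 300,
        'update_frequency': 4,
        'replay_buffer_type': 'priority',   # or 'standard'
        'replay_buffer_size': 100000,
        'rho': 0.25,                        # priority fraction
        'batch_size': 32,
        'gamma': 0.5,
        'lr': 1e-3,
        'scheduler_type': 'linear',         # or 'exponential' (also set 'e_decay')
        'e_final': 0.05,
        'pruned': True,
        'preload_weights': False,
    }
    # placeholder game path; any TextWorld game file would go here
    trainer = KGDQNTrainer('games/example_game.ulx', example_params)
    trainer.train()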