def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = \
        self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    self.dataset_walker = DatasetWalker(split_type, labels=labels,
                                        dataroot=self.dataroot, labels_file=labels_file)
    self.dialogs = self._prepare_conversations()

    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
    self.knowledge, self.snippets = self._prepare_knowledge()

    self._create_examples()
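# --- Hedged sketch: the module-level SPECIAL_TOKENS globals the constructor above
# references. The constructor relies on additional_special_tokens[2] being the
# knowledge separator; the exact token strings below are assumptions for
# illustration and should match the real module-level definitions.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"],
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>",
                         "<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"]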
def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    # labels is True when training the model
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["cls_token"])
    self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["sep_token"])
    # self.unk = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['UNK_token'])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = \
        self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    self.dataset_walker = DatasetWalker(split_type, labels=labels,
                                        dataroot=self.dataroot, labels_file=labels_file)
    # get the parsed dialog data from dataset_walker
    self.dialogs = self._prepare_conversations()
    # print("dialogs: ", self.dialogs[0])
    # e.g.
    # [{'id': 0,
    #   'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}],
    #   'label': None},
    #  {'id': 1,
    #   'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."},
    #           {'speaker': 'S', 'text': 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?'},
    #           {'speaker': 'U', 'text': 'Indian food is usually vegetarian friendly, right?'}],
    #   'label': None}]

    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
    self.knowledge, self.snippets = self._prepare_knowledge()

    self._create_examples()
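# --- Hedged usage sketch for this variant. The enclosing class name
# (BaseDataset), the Namespace fields, and the GPT-2 tokenizer choice are
# illustrative assumptions; the real values come from the training script.
# Note this variant also reads SPECIAL_TOKENS["cls_token"] and
# SPECIAL_TOKENS["sep_token"], so the dict sketched earlier would need e.g.
# "cls_token": "<cls>" and "sep_token": "<sep>" entries added.
from argparse import Namespace
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# convert_tokens_to_ids only resolves these after they are registered
tokenizer.add_special_tokens(SPECIAL_TOKENS)

args = Namespace(dataroot="data", knowledge_file="knowledge.json",
                 knowledge_max_tokens=128)
dataset = BaseDataset(args, tokenizer, split_type="train", labels=True)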
def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    # BERT special tokens
    self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["cls_token"])
    self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["sep_token"])
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    # PAD modified
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = \
        self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    # dataset_walker.py
    #   self.logs:   logs.json
    #   self.labels: labels.json
    #   If a labels_file is passed in, use the output of task 1 (baseline.ktd.json),
    #   which only has the 'target' field (True / False).
    self.dataset_walker = DatasetWalker(split_type, labels=labels,
                                        dataroot=self.dataroot, labels_file=labels_file)

    # self.dialogs: list of dicts
    # For train_baseline:
    #   [{'id': xx,
    #     'log': [{'speaker': xx, 'text': xx}, {...}],
    #     'label': {'target': xx, 'knowledge': [{'domain': xx, 'entity_id': xx}]}},
    #    {...},
    #    {...}]
    #   e.g. self.dialogs[0] = {'id': 0, 'log': [{'speaker': 'U', 'text': 'Looking for a place to eat in the city center.'}], 'label': {'target': False}}
    # For run_baseline, 'label' only has 'target':
    #   [{'id': int,
    #     'log': [{'speaker': string, 'text': string}, {...}, {...}],
    #     'label': {'target': True/False}},
    #    {...},
    #    {...}]
    self.dialogs = self._prepare_conversations()

    # knowledge_reader.py
    #   self.knowledge: knowledge.json
    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)

    # self.snippets: dict
    #   key:   'domain__entity_id'
    #   value: list of token ids for str(self.knowledge_sep_token).join([domain, name]),
    #          truncated to self.args.knowledge_max_tokens
    self.knowledge, self.snippets = self._prepare_knowledge()
    print("# of snippets = ", len(self.snippets.keys()))
    print('\n\n')

    self._create_examples()
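# --- Hedged sketch of the snippet construction the comments above describe:
# keys are 'domain__entity_id', values are token-id lists of
# "domain <knowledge_sep> name" truncated to args.knowledge_max_tokens. The
# knowledge.json layout assumed here ({domain: {entity_id: {'name': ...}}}) and
# the helper name are illustrative assumptions, not the baseline's actual
# _prepare_knowledge.
def _prepare_knowledge_sketch(self):
    knowledge = self.knowledge_reader.knowledge  # parsed knowledge.json
    snippets = {}
    for domain, entities in knowledge.items():
        for entity_id, entity in entities.items():
            key = "{}__{}".format(domain, entity_id)
            # join domain and entity name with the knowledge separator token
            text = str(self.knowledge_sep_token).join([domain, entity.get("name") or ""])
            token_ids = self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
            snippets[key] = token_ids[:self.args.knowledge_max_tokens]
    return knowledge, snippets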