示例#1
0
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        """Set up the dataset: resolve special-token ids, then load dialogs,
        knowledge snippets, and training examples.

        Args:
            args: namespace providing at least ``dataroot`` and
                ``knowledge_file``.
            tokenizer: tokenizer used to map special tokens to vocabulary ids.
            split_type: which dataset split to read (e.g. "train", "val").
            labels: whether ground-truth labels should be loaded.
            labels_file: optional path to an alternative labels file.
        """
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        # Keep the token tables on the instance and resolve each special
        # token to its vocabulary id once, up front.
        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        to_ids = self.tokenizer.convert_tokens_to_ids
        self.bos = to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = to_ids(self.SPECIAL_TOKENS["eos_token"])
        self.pad = to_ids(self.SPECIAL_TOKENS["pad_token"])
        (self.speaker1, self.speaker2, self.knowledge_sep,
         self.knowledge_tag) = to_ids(
             self.SPECIAL_TOKENS["additional_special_tokens"])
        # The raw separator token string (third additional special token) is
        # needed later when assembling knowledge text.
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        # Walk the raw dataset on disk and parse it into conversations.
        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations()

        # Load the knowledge base and pre-tokenize its snippets.
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()
示例#2
0
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        """Initialize the dataset for one split.

        During training ``labels`` is True so gold labels are read alongside
        the dialog logs.

        Args:
            args: namespace providing at least ``dataroot`` and
                ``knowledge_file``.
            tokenizer: tokenizer used to map special tokens to vocabulary ids.
            split_type: which dataset split to read (e.g. "train", "val").
            labels: whether ground-truth labels should be loaded.
            labels_file: optional path to an alternative labels file.
        """
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        # Resolve every special token to its vocabulary id once.
        to_ids = self.tokenizer.convert_tokens_to_ids
        self.bos = to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = to_ids(self.SPECIAL_TOKENS["eos_token"])
        self.pad = to_ids(self.SPECIAL_TOKENS["pad_token"])
        self.cls = to_ids(self.SPECIAL_TOKENS['cls_token'])
        self.sep = to_ids(self.SPECIAL_TOKENS['sep_token'])

        (self.speaker1, self.speaker2, self.knowledge_sep,
         self.knowledge_tag) = to_ids(
             self.SPECIAL_TOKENS["additional_special_tokens"])
        # Raw string form of the knowledge separator (third additional
        # special token), used when joining knowledge fields as text.
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)
        # Parsed dialog data from dataset_walker, e.g.:
        #   [{'id': 0,
        #     'log': [{'speaker': 'U', 'text': "I'd really like to take my "
        #              "client out to a nice restaurant that serves indian "
        #              "food."}],
        #     'label': None},
        #    ...]
        self.dialogs = self._prepare_conversations()

        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()
示例#3
0
    def __init__(self,
                 args,
                 tokenizer,
                 split_type,
                 labels=True,
                 labels_file=None):
        """Load one dataset split: special-token ids, dialogs, knowledge.

        Args:
            args: namespace providing at least ``dataroot`` and
                ``knowledge_file``.
            tokenizer: tokenizer used to map special tokens to vocabulary ids.
            split_type: which dataset split to read (e.g. "train", "val").
            labels: whether ground-truth labels should be loaded.
            labels_file: optional labels file; when passed it is the output
                of task 1 (baseline.ktd.json) and carries only a
                target True/False per turn.
        """
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES

        to_ids = self.tokenizer.convert_tokens_to_ids

        # BERT-style special tokens.
        self.cls = to_ids(self.SPECIAL_TOKENS["cls_token"])
        self.sep = to_ids(self.SPECIAL_TOKENS['sep_token'])

        # Sequence boundary and padding tokens.
        self.bos = to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = to_ids(self.SPECIAL_TOKENS["eos_token"])
        self.pad = to_ids(self.SPECIAL_TOKENS["pad_token"])

        (self.speaker1, self.speaker2, self.knowledge_sep,
         self.knowledge_tag) = to_ids(
             self.SPECIAL_TOKENS["additional_special_tokens"])
        # Raw string form of the knowledge separator (third additional
        # special token).
        self.knowledge_sep_token = self.SPECIAL_TOKENS[
            "additional_special_tokens"][2]

        # DatasetWalker (dataset_walker.py) exposes:
        #   self.logs   -> logs.json
        #   self.labels -> labels.json
        self.dataset_walker = DatasetWalker(split_type,
                                            labels=labels,
                                            dataroot=self.dataroot,
                                            labels_file=labels_file)

        # self.dialogs: list of dicts, one per (partial) conversation.
        # For train_baseline each entry looks like:
        #   {'id': int,
        #    'log': [{'speaker': str, 'text': str}, ...],
        #    'label': {'target': bool,
        #              'knowledge': [{'domain': str, 'entity_id': int}]}}
        # e.g. {'id': 0,
        #       'log': [{'speaker': 'U',
        #                'text': 'Looking for a place to eat in the city '
        #                        'center.'}],
        #       'label': {'target': False}}
        # For run_baseline 'label' carries only 'target': True/False.
        self.dialogs = self._prepare_conversations()

        # KnowledgeReader (knowledge_reader.py) wraps knowledge.json.
        self.knowledge_reader = KnowledgeReader(self.dataroot,
                                                args.knowledge_file)

        # self.snippets: dict mapping 'domain__entity_id' to a tokenized
        # knowledge string, str(self.knowledge_sep_token).join([domain,
        # name]), truncated to self.args.knowledge_max_tokens.
        self.knowledge, self.snippets = self._prepare_knowledge()
        print("# of snippets = ", len(self.snippets.keys()))
        print('\n\n')

        self._create_examples()