def _setup_data(self, opt):
    """
    Load the gender-labelled examples and optionally add unknown/neutral ones.

    When ``self.add_unknown_classes`` is set, the inferred "about" examples
    and one true-neutral PARTNER copy of every loaded example are gathered,
    optionally subsampled according to ``self.opt['unknown_temp']``, and
    appended to the loaded data.

    :param opt:
        options dict; ``opt['datatype']`` selects which data to load.

    :return:
        a new list of examples (shuffled in place when ``self.is_train``).
    """
    # check that the data was downloaded and set up properly
    self._check_data_downloaded(opt)
    # Load map from image ID to gender
    data = self._load_gender_data(opt['datatype'])

    extra_data = []
    if self.add_unknown_classes:
        # load about data (unknown but inferred)
        # NOTE(review): the other teachers' call sites pass the whole
        # ``self.opt`` as the second argument; this one passes only the
        # datatype string -- confirm against the helper's signature.
        extra_data = list(
            gend_utils.get_inferred_about_data(
                self.opt['task'], self.opt['datatype']
            )
        )

        # now create partner/TO data: true neutral
        for ex in data:
            partner_ex = deepcopy(ex)
            partner_ex['labels'] = [f'PARTNER:{gend_utils.NEUTRAL}']
            partner_ex['class_type'] = 'neutral'
            # Append the relabelled copy; appending ``ex`` itself would only
            # duplicate the original example and drop the neutral label.
            extra_data.append(partner_ex)

        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            # Keep only a fraction of the extra examples.
            to_samp = int(sample_rate * len(extra_data))
            extra_data = random.sample(extra_data, to_samp)

    # Add the (possibly subsampled) extras exactly once. Building a new list
    # also leaves the list returned by ``_load_gender_data`` untouched.
    data = data + extra_data
    if self.is_train:
        random.shuffle(data)

    return data
def _setup_data(self, opt):
    """
    Build ``self.data`` from the LIGHT dialogues and print label totals.

    Every utterance whose speaker and partner characters both appear in the
    persona map yields a ``SELF:<gender>`` example and/or a
    ``PARTNER:<gender>`` example, depending on ``self.labels_to_use``.
    When ``self.labels_to_use == 'all'`` and ``self.add_unknown_classes`` is
    set, the inferred "about" examples are appended as well, subsampled
    according to ``self.opt['unknown_temp']``.

    :param opt:
        options dict; reads ``opt['datatype']`` and ``opt['datapath']``.
    """
    build(opt)
    dt = opt['datatype'].split(':')[0]

    # Build a map from persona to gender
    persona_file = os.path.join(opt['datapath'], PERSONA_PATH)
    with open(persona_file, 'rb') as f:
        personas = json.load(f)['old']
    persona_map = {}
    for gender, lst in personas.items():
        for x in lst:
            persona_map[int(x['char_id'])] = {'name': x['name'], 'gender': gender}

    # Build a list of dialogue utterances and associated persona IDs
    # NOTE(security): pickle can execute arbitrary code while loading; this
    # is only acceptable if the file under ``opt['datapath']`` is trusted.
    light_file = os.path.join(opt['datapath'], LIGHT_DATA_PATH.format(dt))
    with open(light_file, 'rb') as f:
        light_world = pickle.load(f)

    utt_to_pers = []
    for x in light_world:
        characters = x['conv_info']['characters']
        for act in x['conv_info']['acts']:
            p_uid = act['id'].lower()
            self_char_id = None
            partner_char_id = None
            self_name = None
            partner_name = None
            for y in characters:
                name = y[0].lower()
                if name == p_uid:
                    # the speaker of this utterance
                    self_char_id = y[1]['id']
                    self_name = name
                else:
                    # anyone else is the partner (the last one listed wins)
                    partner_char_id = y[1]['id']
                    partner_name = name
            if self_char_id is not None and partner_char_id is not None:
                utt_to_pers.append(
                    {
                        'text': act['text'],
                        'self_id': self_char_id,
                        'self_name': self_name,
                        'partner_id': partner_char_id,
                        'partner_name': partner_name,
                    }
                )

    self.data = []
    # Utterances skipped because a persona was not in the map; tallied but
    # not reported.
    missing = 0
    counts = {
        'partner': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
        'self': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
    }
    for x in utt_to_pers:
        if x['self_id'] not in persona_map or x['partner_id'] not in persona_map:
            missing += 1
            continue

        self_gender = persona_map[x['self_id']]['gender']
        partner_gender = persona_map[x['partner_id']]['gender']
        act = {
            'text': x['text'].lower(),
            'self_id': x['self_id'],
            'partner_id': x['partner_id'],
            'id': 'LIGHT Gender',
            'episode_done': True,
        }
        if self_gender == gend_utils.NEUTRAL:
            # not True neutral
            self_gender = gend_utils.UNKNOWN
        if partner_gender == gend_utils.NEUTRAL:
            # not True neutral
            partner_gender = gend_utils.UNKNOWN

        if self_gender is not None and self.labels_to_use != 'partner':
            self_act = deepcopy(act)
            self_act['labels'] = [f'SELF:{self_gender}']
            self_act['class_type'] = 'self'
            self.data.append(self_act)
        if partner_gender is not None and self.labels_to_use != 'self':
            partner_act = deepcopy(act)
            partner_act['labels'] = [f'PARTNER:{partner_gender}']
            partner_act['class_type'] = 'partner'
            self.data.append(partner_act)

        # ``.get`` keeps the tally from raising KeyError when a gender falls
        # outside the three pre-seeded categories.
        counts['partner'][partner_gender] = (
            counts['partner'].get(partner_gender, 0) + 1
        )
        counts['self'][self_gender] = counts['self'].get(self_gender, 0) + 1

    if self.labels_to_use == 'all' and self.add_unknown_classes:
        # load about data
        all_about_data = gend_utils.get_inferred_about_data(
            self.opt['task'], self.opt
        )
        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            # keep only a fraction of the inferred examples
            to_samp = int(sample_rate * len(all_about_data))
            self.data += random.sample(all_about_data, to_samp)
        else:
            self.data += all_about_data

    total = len(self.data)
    print(f'Total: {total}')
    for x in ['self', 'partner']:
        print(f'Totals for {x}:')
        subtot = sum(counts[x].values())
        for k, v in counts[x].items():
            # Guard the ratio so an empty dataset does not divide by zero.
            frac = v / subtot if subtot else 0.0
            print(f'\t{k}: {v} ({frac})')
def _setup_data(self, opt):
    """
    Build ``self.data`` from the original ConvAI2 teacher and print totals.

    The wrapped teacher is replayed episode by episode. The persona lines
    are stripped from the first message of each episode and passed to
    ``self.get_genders``; every message of the episode then becomes a
    single-turn ``SELF:<gender>`` and/or ``PARTNER:<gender>`` example,
    depending on ``self.labels_to_use``. When ``self.labels_to_use == 'all'``
    and ``self.add_unknown_classes`` is set, the inferred "about" examples
    are appended, subsampled according to ``self.opt['unknown_temp']``.

    :param opt:
        options dict; reads ``opt['datatype']`` and is passed to
        ``OrigConvai2Teacher``.
    """
    counts = {
        'partner': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
        'self': {gend_utils.UNKNOWN: 0, gend_utils.FEM: 0, gend_utils.MASC: 0},
    }

    # NOTE(review): the warning announces a switch to valid, but nothing in
    # this method applies it -- the wrapped teacher receives ``opt``
    # unchanged. Confirm that it handles the test datatype itself.
    if opt['datatype'].split(':')[0] == 'test':
        warn_once('No test set; switching to valid')

    # build data
    print('[ Building data ... ]')
    new_eps = []
    orig_teacher = OrigConvai2Teacher(opt)
    total_exs = orig_teacher.num_examples()
    num_exs = 0
    while num_exs < total_exs:
        # Collect one full episode from the wrapped teacher.
        current_episode = []
        episode_done = False
        while not episode_done:
            # TODO: eventually all teachers should return Messages, so
            # we should assert this
            action = Message(orig_teacher.act())
            current_episode.append(action)
            episode_done = action.get('episode_done', False)
            num_exs += 1

        # Split the first message into persona lines and dialogue text.
        first_ex_text = []
        partner_persona = []
        your_persona = []
        for line in current_episode[0]['text'].split('\n'):
            # NOTE: we flip "your" and "partner" here since we are taking the 'text'
            # field instead of the 'label'
            if "partner's persona: " in line:
                your_persona.append(line.split("partner's persona: ")[1])
            elif 'your persona: ' in line:
                partner_persona.append(line.split('your persona: ')[1])
            else:
                first_ex_text.append(line)

        your, your_prob, partner, partner_prob = self.get_genders(
            your_persona, partner_persona
        )

        # The joined personas are identical for every example of the episode.
        your_persona_text = '\n'.join(your_persona)
        partner_persona_text = '\n'.join(partner_persona)

        for i, ex in enumerate(current_episode):
            # ``.get`` keeps the tally from raising KeyError when a gender
            # (possibly None, see the checks below) is not pre-seeded.
            counts['self'][your] = counts['self'].get(your, 0) + 1
            counts['partner'][partner] = counts['partner'].get(partner, 0) + 1

            # The first message uses its text with the persona lines removed.
            text = '\n'.join(first_ex_text) if i == 0 else ex['text']
            new_ex = {
                'text': text,
                'episode_done': True,
                'your_persona': your_persona_text,
                'partner_persona': partner_persona_text,
                'id': 'ConvAI2 Gender',
            }
            if not self.use_probably:
                new_ex['partner_prob'] = partner_prob
                new_ex['your_prob'] = your_prob

            if your is not None and self.labels_to_use != 'partner':
                # Get the your task
                your_ex = deepcopy(new_ex)
                your_ex['labels'] = [f'SELF:{your}']
                your_ex['class_type'] = 'self'
                new_eps.append(your_ex)

            if partner is not None and self.labels_to_use != 'self':
                # Get the partner task
                partner_ex = deepcopy(new_ex)
                partner_ex['labels'] = [f'PARTNER:{partner}']
                partner_ex['class_type'] = 'partner'
                new_eps.append(partner_ex)

    if self.labels_to_use == 'all' and self.add_unknown_classes:
        # load about data
        all_about_data = gend_utils.get_inferred_about_data(
            self.opt['task'], self.opt
        )
        sample_rate = self.opt['unknown_temp']
        if sample_rate < 1.0:
            # keep only a fraction of the inferred examples
            to_samp = int(sample_rate * len(all_about_data))
            new_eps += random.sample(all_about_data, to_samp)
        else:
            new_eps += all_about_data

    if self.is_train:
        random.shuffle(new_eps)

    self.data = new_eps
    print(f'Missing cnt: {self.missing_cnt} / {len(self.data) * 2}')
    for x in ['self', 'partner']:
        print(f'Totals for {x}:')
        subtot = sum(counts[x].values())
        for k, v in counts[x].items():
            # Guard the ratio so an empty dataset does not divide by zero.
            frac = v / subtot if subtot else 0.0
            print(f'\t{k}: {v} ({frac})')