def __init__(self, directory, parts=None):
    """Set up the dataset from the CLEVR files found under ``directory``.

    The first training image fixes the world size (first two entries of
    its ``shape``). A single pass over the training questions records the
    longest tokenized question, the longest tokenized answer and the set
    of all tokens, which are then handed to the parent constructor.

    Args:
        directory: Forwarded unchanged to the ``clevr_util`` helpers.
        parts: Forwarded unchanged to the ``clevr_util`` helpers.
    """
    first_image = next(clevr_util.images_iter(
        directory=directory, parts=parts, mode='train'))
    world_size = tuple(first_image.shape[:2])

    self.question_size = 0
    self.answer_size = 0
    vocabulary = set()
    train_questions = clevr_util.questions_iter(
        directory=directory, parts=parts, mode='train')
    for _, question, _, answer in train_questions:
        question_tokens = util.string2tokens(string=question)
        answer_tokens = util.string2tokens(string=answer)
        if len(question_tokens) > self.question_size:
            self.question_size = len(question_tokens)
        if len(answer_tokens) > self.answer_size:
            self.answer_size = len(answer_tokens)
        vocabulary.update(question_tokens, answer_tokens)

    vector_sizes = dict(question=self.question_size, answer=self.answer_size)
    super(CLEVRDataset, self).__init__(
        world_size=world_size, vectors=vector_sizes, words=sorted(vocabulary))

    # One clevr_util.clevr(...) result per dataset mode.
    per_mode = dict()
    for mode in ('train', 'validation', 'test'):
        per_mode[mode] = clevr_util.clevr(
            directory=directory, parts=parts, mode=mode)
    self.clevr = per_mode
def realize(self, captions):
    """Turn caption objects into token lists by piping their MRS through ACE.

    A subprocess is started from ``self.ace_path`` with the grammar at
    ``self.erg_path``. Every caption is converted with
    ``self.clause_dmrs``, post-processed, and its MRS is written to the
    subprocess' stdin, one per line. Each non-empty stdout line is taken
    as the realization of the caption at the same position.

    Args:
        captions: Mutable sequence of captions; it is overwritten in place
            with the token lists and also returned.

    Returns:
        The same ``captions`` object, now holding token lists.

    Raises:
        AssertionError: If any stderr line does not match ``self.regex``,
            or if the number of realizations differs from the number of
            captions.
        Exception: Whatever ``subprocess.Popen`` raised, re-raised after
            diagnostics have been printed.
    """
    try:
        ace = subprocess.Popen(
            [self.ace_path, '-g', self.erg_path, '-1Te', '-r', 'root_strict'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except Exception as e:
        import sys
        from datetime import datetime
        print(datetime.now().strftime('%H:%M:%S'))
        # Only OSError carries ``strerror``; fall back to the exception
        # itself so the diagnostics never mask the original error.
        print(getattr(e, 'strerror', e))
        print(sys.exc_info()[0])
        raise

    dmrs_list = list()
    mrs_list = list()
    for caption in captions:
        dmrs = self.clause_dmrs(caption)
        dmrs.apply_paraphrases(self.post_processing.values())
        dmrs.remove_underspecifications()
        dmrs_list.append(dmrs)
        mrs_list.append(dmrs.get_mrs() + '\n')

    stdout_data, stderr_data = ace.communicate(''.join(mrs_list).encode())
    stderr_data = stderr_data.decode('utf-8').splitlines()
    stdout_data = stdout_data.decode('utf-8').splitlines()

    if not all(self.regex.match(line) for line in stderr_data):
        # Pair each offending stderr line with the DMRS/MRS at the same
        # position so the failing input can be inspected.
        report = '\n'.join(
            '{}\n{}\n{}\n'.format(line, dmrs.dumps_xml().decode(), mrs)
            for line, dmrs, mrs in zip(stderr_data, dmrs_list, mrs_list)
            if not self.regex.match(line))
        try:
            # The second-to-last stderr line is expected to carry the
            # number of generated sentences starting at column 16
            # (presumably ACE's "NOTE: generated N / M ..." summary).
            summary = stderr_data[-2]
            failures = len(captions) - int(summary[16:summary.index(' ', 16)])
        except (IndexError, ValueError):
            # No parsable summary: still report the offending lines
            # instead of failing while building the message.
            failures = 'unknown'
        raise AssertionError(
            '\n\n' + report + '\nFailures: {}\n'.format(failures))

    caption_strings = [line for line in stdout_data if line]
    assert len(caption_strings) == len(captions)
    for n, caption in enumerate(caption_strings):
        captions[n] = util.string2tokens(string=caption)
    return captions
def realize(self, captions):
    """Turn caption objects into token lists by piping their MRS through ACE.

    A subprocess is started from ``self.ace_path`` with the grammar at
    ``self.erg_path``. Every caption is converted with
    ``self.caption_dmrs``, paraphrased, and its MRS is written to the
    subprocess' stdin, one per line. stderr is then walked line by line
    to find captions that could not be realized.

    Args:
        captions: Mutable sequence of captions; it is overwritten in place
            with the token lists and also returned.

    Returns:
        The same ``captions`` object, now holding token lists.

    Raises:
        AssertionError: If a stderr line after the last caption does not
            match ``self.final_regex``, or if the number of realizations
            differs from the number of captions.
        SystemExit: With status 0 when at least one caption failed.
        Exception: Whatever ``subprocess.Popen`` raised, re-raised after
            diagnostics have been printed.
    """
    import sys

    try:
        ace = subprocess.Popen(
            [self.ace_path, '-g', self.erg_path, '-1e', '-r', 'root_strict'],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
    except Exception as e:
        from datetime import datetime
        print(datetime.now().strftime('%H:%M:%S'))
        # Only OSError carries ``strerror``; fall back to the exception
        # itself so the diagnostics never mask the original error.
        print(getattr(e, 'strerror', e))
        print(sys.exc_info()[0])
        raise

    dmrs_list = list()
    mrs_list = list()
    for caption in captions:
        dmrs = self.caption_dmrs(caption=caption)
        dmrs = dmrs.apply_paraphrases(self.post_processing.values())
        dmrs.remove_underspecifications()
        dmrs_list.append(dmrs)
        mrs_list.append(dmrs.get_mrs() + '\n')

    stdout_data, stderr_data = ace.communicate(
        input=''.join(mrs_list).encode())
    stderr_data = stderr_data.decode('utf-8').splitlines()
    stdout_data = stdout_data.decode('utf-8').splitlines()

    failures = 0
    n = 0  # index of the caption the current stderr line refers to
    unexpected = False  # an unrecognised line preceded the current verdict
    for line in stderr_data:
        if n == len(captions):
            # Every caption has a verdict; only trailer lines may follow.
            assert self.final_regex.match(line), line
            continue
        if self.successful_regex.match(line):
            if unexpected:
                # Succeeded, but with unrecognised output: show the input.
                print(dmrs_list[n].dumps_xml().decode())
                print(mrs_list[n])
                unexpected = False
            n += 1
        elif self.unsuccessful_regex.match(line):
            print(dmrs_list[n].dumps_xml().decode())
            print(mrs_list[n])
            failures += 1
            n += 1
        else:
            print('Unexpected: ' + line)
            unexpected = True

    if failures > 0:
        print('Failures: {}'.format(failures))
        # sys.exit instead of the site-injected ``exit`` builtin, which is
        # not guaranteed to exist. NOTE(review): status 0 is kept from the
        # original even though this is a failure path -- confirm intent.
        sys.exit(0)

    caption_strings = [line for line in stdout_data if line]
    # Both streams are lists of lines here; join them so that building the
    # message cannot raise TypeError in place of the AssertionError.
    assert len(caption_strings) == len(captions), \
        '\n'.join(stdout_data) + '\n' + '\n'.join(stderr_data)
    for n, caption in enumerate(caption_strings):
        captions[n] = util.string2tokens(string=caption)
    return captions
def __init__(self, directory):
    """Set up the dataset from the NLVR files found under ``directory``.

    The world size is read from the first training item produced by
    ``nlvr_util.images_iter`` (element ``[1][0]``, first two entries of
    its ``shape``). A single pass over the training descriptions records
    the longest tokenized description and the set of all tokens, which
    are then handed to the parent constructor.

    Args:
        directory: Forwarded unchanged to the ``nlvr_util`` helpers.
    """
    first_item = next(nlvr_util.images_iter(directory=directory, mode='train'))
    world_size = tuple(first_item[1][0].shape[:2])

    self.description_size = 0
    vocabulary = set()
    train_descriptions = nlvr_util.descriptions_iter(
        directory=directory, mode='train')
    for _, _, description, _ in train_descriptions:
        tokens = util.string2tokens(string=description)
        if len(tokens) > self.description_size:
            self.description_size = len(tokens)
        vocabulary.update(tokens)

    vector_sizes = dict(description=self.description_size)
    super(NLVRDataset, self).__init__(
        world_size=world_size, vectors=vector_sizes, words=sorted(vocabulary))

    # One nlvr_util.nlvr(...) result per dataset mode.
    per_mode = dict()
    for mode in ('train', 'validation', 'test'):
        per_mode[mode] = nlvr_util.nlvr(directory=directory, mode=mode)
    self.nlvr = per_mode
def descriptions_iter(directory, mode):
    """Yield the entries of ``<directory>/<mode>/<mode>.json``, one per line.

    The file is read as JSON lines: every non-blank line is parsed as one
    JSON object. The mode ``'validation'`` is read from the ``dev`` files.

    Args:
        directory: Root directory containing one sub-directory per mode.
        mode: Dataset split; ``'validation'`` is mapped to ``'dev'``.

    Yields:
        ``(identifier, (world_model1, world_model2, world_model3),
        description, agreement)`` where ``description`` is the lower-cased
        sentence, terminated with ``'.'`` and passed through
        ``util.string2tokens``, and ``agreement`` is the label as a bool.

    Raises:
        AssertionError: If an entry does not have the expected layout.
    """
    mode = 'dev' if mode == 'validation' else mode
    path = os.path.join(directory, mode, mode + '.json')
    with open(path, 'r') as filehandle:
        for line in filehandle:
            line = line.strip()
            if not line:
                # Blank lines carry no entry; json.loads('') would raise.
                continue
            description_dict = json.loads(line)

            identifier = description_dict['identifier']
            assert identifier[-2:] in ('-0', '-1', '-2', '-3')

            world_model1, world_model2, world_model3 = \
                description_dict['structured_rep']

            description = description_dict['sentence'].lower()
            # endswith() also copes with an empty sentence, where
            # indexing [-1] would raise IndexError.
            if not description.endswith('.'):
                description += '.'
            description = util.string2tokens(string=description)

            agreement = description_dict['label']
            assert agreement in ('true', 'false')
            agreement = (agreement == 'true')

            # Training entries carry one evaluation, all others five.
            assert len(description_dict['evals']) == \
                (1 if mode == 'train' else 5)
            assert len(description_dict) == 5

            yield (identifier,
                   (world_model1, world_model2, world_model3),
                   description,
                   agreement)
def _skip_past(filehandle, terminator):
    """Consume characters up to and including the next ``terminator``."""
    while True:
        char = filehandle.read(1)
        if char == terminator:
            return
        if not char:
            # read() returns '' at end of file; without this check the
            # scan would never terminate on a truncated file.
            raise AssertionError(
                'unexpected end of file while looking for {!r}'.format(
                    terminator))


def questions_iter(directory, mode, parts=None):
    """Stream the questions of one CLEVR split without loading the file.

    The file ``<directory>/questions/CLEVR_<split>_questions.json`` is
    scanned by hand: top-level entries before ``"questions"`` are skipped
    by reading up to their next ``'}'``, the question list is delegated
    to ``json_list_generator``, and entries after the list are skipped
    the same way until the closing ``'}'`` and end of file.

    Args:
        directory: Root directory containing the ``questions`` folder.
        mode: Dataset split; ``'validation'`` is read from ``val`` files.
        parts: Optional mapping from mode to a suffix appended to the
            split name.

    Yields:
        ``(image_index, question, question_model, answer)``. ``question``
        is lower-cased, terminated with ``'?'`` and tokenized. In test
        mode ``question_model`` is empty and the answer is derived from
        the placeholder ``'[UNKNOWN]'``; otherwise ``question_model``
        holds ``family`` and ``program`` and the answer is lower-cased
        and mapped through ``numbers`` when it is a key there.

    Raises:
        AssertionError: If the file does not have the expected layout,
            including when it ends prematurely.
    """
    split = 'val' if mode == 'validation' else mode
    if parts is not None:
        split += parts[mode]
    path = os.path.join(
        directory, 'questions', 'CLEVR_{}_questions.json'.format(split))

    with open(path, 'r') as filehandle:
        chars = filehandle.read(2)
        assert chars == '{"'

        # Skip every top-level entry whose key does not start with 'q'.
        chars = filehandle.read(1)
        while chars != 'q':
            _skip_past(filehandle, '"')  # rest of the key
            chars = filehandle.read(3)
            assert chars == ': {'
            _skip_past(filehandle, '}')  # the entry's value
            chars = filehandle.read(3)
            assert chars == ', "'
            chars = filehandle.read(1)
        chars = filehandle.read(11)
        assert chars == 'uestions": '

        image_index = 0
        for n, question_dict in enumerate(json_list_generator(fp=filehandle)):
            # Image indices may only stay the same or advance by one.
            if image_index != question_dict['image_index']:
                image_index += 1
                assert image_index == question_dict['image_index']

            question = question_dict['question'].lower()
            # endswith() also copes with an empty question, where
            # indexing [-1] would raise IndexError.
            if not question.endswith('?'):
                question += '?'
            question = util.string2tokens(string=question)

            if mode == 'test':
                question_model = dict()
                answer = '[UNKNOWN]'
            else:
                family = question_dict['question_family_index']
                program = question_dict['program']
                question_model = dict(family=family, program=program)
                answer = question_dict['answer'].lower()
                if answer in numbers:
                    answer = numbers[answer]
            # NOTE(review): the original nesting of this statement could
            # not be recovered; confirm that the '[UNKNOWN]' placeholder
            # is meant to be tokenized like real answers.
            answer = util.string2tokens(string=answer)

            assert question_dict['question_index'] == n
            assert question_dict['split'] == split
            assert question_dict['image_filename'] == \
                'CLEVR_{}_{:0>6}.png'.format(split, image_index)
            assert len(question_dict) == 8 or \
                (mode == 'test' and len(question_dict) == 5)

            yield image_index, question, question_model, answer

        # Skip any top-level entries that follow the question list.
        chars = filehandle.read(1)
        while chars == ',':
            chars = filehandle.read(2)
            assert chars == ' "'
            _skip_past(filehandle, '"')
            chars = filehandle.read(3)
            assert chars == ': {'
            _skip_past(filehandle, '}')
            chars = filehandle.read(1)
        assert chars == '}'
        chars = filehandle.read()
        assert not chars