def __init__(self, config, set_name, size=None):
    """Load the questions, parses and annotations of one dataset split.

    Args:
        config: configuration object; stored on the instance unchanged.
        set_name: split name, substituted into STRING_FILE, PARSE_FILE and
            ANN_FILE. The "test" split is never read from disk and yields
            an empty dataset.
        size: optional cap on the number of (question, parse) pairs that
            are considered, applied before any filtering. None means all.

    Sets self.by_id (question id -> datum), self.by_layout_type
    (datum.layout.modules -> list of datums) and self.layout_types.
    A question is left out when its parse yields no layout, when building
    its datum raises IOError, when none of its answers survives the
    single-token filter, or when its most frequent answer was given once.
    """
    self.config = config
    self.size = size

    data_by_id = dict()
    data_by_layout_type = defaultdict(list)

    if set_name == "test":
        self.by_id = data_by_id
        self.by_layout_type = data_by_layout_type
        self.layout_types = set()
        return

    with open(STRING_FILE % set_name) as question_f, \
            open(PARSE_FILE % set_name) as parse_f:
        questions = json.load(question_f)["questions"]
        parses = [line.strip() for line in parse_f]
    assert len(questions) == len(parses)

    # zip() returns a lazy iterator on Python 3 and cannot be sliced, so
    # materialise it first; on Python 2 this is a harmless copy.
    pairs = list(zip(questions, parses))
    if size is not None:
        pairs = pairs[:size]

    for question, parse_str in pairs:
        question_id = question["question_id"]
        # The question is indexed before the layout check, so its words
        # reach STRING_INDEX even when the question is skipped below.
        indexed_question = [
            STRING_INDEX.index(w) for w in question["question"].split()]
        layout = parse_to_layout(parse_tree(parse_str))
        if layout is None:
            continue
        image_id = question["image_id"]
        try:
            datum = ImageDatum(
                question_id, indexed_question, layout, set_name, image_id,
                [])
        except IOError as e:
            # Best effort: presumably the image data for this question is
            # missing (TODO confirm). Skip the question but leave a trace.
            logging.debug("skipping question %s: %s", question_id, e)
            continue
        data_by_id[question_id] = datum

    with open(ANN_FILE % set_name) as ann_f:
        annotations = json.load(ann_f)["annotations"]

    for ann in annotations:
        question_id = ann["question_id"]
        if question_id not in data_by_id:
            continue

        # Count only answers without a space, slash or comma.
        answer_counter = defaultdict(int)
        for ans in ann["answers"]:
            ans_words = ans["answer"]
            if " " in ans_words or "/" in ans_words or "," in ans_words:
                continue
            answer_counter[ans_words] += 1

        if not answer_counter:
            del data_by_id[question_id]
            continue

        # Highest count wins; ties go to the greatest answer string, which
        # is what taking the last element of the sorted pairs gave.
        best_count, best_answer = max(
            (count, word) for word, count in answer_counter.items())
        if best_count == 1:
            # No answer was given more than once: drop the question.
            del data_by_id[question_id]
            continue

        data_by_id[question_id].outputs.append(
            ANSWER_INDEX.index(best_answer))

    for datum in data_by_id.values():
        data_by_layout_type[datum.layout.modules].append(datum)

    self.by_id = data_by_id
    self.by_layout_type = data_by_layout_type
    self.layout_types = data_by_layout_type.keys()

    logging.info("%s:", set_name.upper())
    logging.info("%s items", len(self.by_id))
    logging.info("%d answers", len(ANSWER_INDEX))
    logging.info("%d layouts", len(self.layout_types))
    logging.info("")
def __init__(self, config, set_name, size=None):
    """Load the questions, parses and annotations of one dataset split.

    Args:
        config: configuration object; stored on the instance unchanged.
        set_name: split name, substituted into STRING_FILE, PARSE_FILE and
            ANN_FILE. The "test" split is never read from disk and yields
            an empty dataset.
        size: optional cap on the number of (question, parse) pairs that
            are considered, applied before any filtering. None means all.

    Sets self.by_id (question id -> datum), self.by_layout_type
    (datum.layout.modules -> list of datums) and self.layout_types.
    A question is left out when its parse yields no layout, when building
    its datum raises IOError, when none of its answers survives the
    single-token filter, or when its most frequent answer was given once.
    """
    self.config = config
    self.size = size

    data_by_id = dict()
    data_by_layout_type = defaultdict(list)

    if set_name == "test":
        self.by_id = data_by_id
        self.by_layout_type = data_by_layout_type
        self.layout_types = set()
        return

    with open(STRING_FILE % set_name) as question_f, \
            open(PARSE_FILE % set_name) as parse_f:
        questions = json.load(question_f)["questions"]
        parses = [line.strip() for line in parse_f]
    assert len(questions) == len(parses)

    # zip() returns a lazy iterator on Python 3 and cannot be sliced, so
    # materialise it first; on Python 2 this is a harmless copy.
    pairs = list(zip(questions, parses))
    if size is not None:
        pairs = pairs[:size]

    for question, parse_str in pairs:
        question_id = question["question_id"]
        # The question is indexed before the layout check, so its words
        # reach STRING_INDEX even when the question is skipped below.
        indexed_question = [
            STRING_INDEX.index(w) for w in question["question"].split()]
        layout = parse_to_layout(parse_tree(parse_str))
        if layout is None:
            continue
        image_id = question["image_id"]
        try:
            datum = ImageDatum(
                question_id, indexed_question, layout, set_name, image_id,
                [])
        except IOError as e:
            # Best effort: presumably the image data for this question is
            # missing (TODO confirm). Skip the question but leave a trace.
            logging.debug("skipping question %s: %s", question_id, e)
            continue
        data_by_id[question_id] = datum

    with open(ANN_FILE % set_name) as ann_f:
        annotations = json.load(ann_f)["annotations"]

    for ann in annotations:
        question_id = ann["question_id"]
        if question_id not in data_by_id:
            continue

        # Count only answers without a space, slash or comma.
        answer_counter = defaultdict(int)
        for ans in ann["answers"]:
            ans_words = ans["answer"]
            if " " in ans_words or "/" in ans_words or "," in ans_words:
                continue
            answer_counter[ans_words] += 1

        if not answer_counter:
            del data_by_id[question_id]
            continue

        # Highest count wins; ties go to the greatest answer string, which
        # is what taking the last element of the sorted pairs gave.
        best_count, best_answer = max(
            (count, word) for word, count in answer_counter.items())
        if best_count == 1:
            # No answer was given more than once: drop the question.
            del data_by_id[question_id]
            continue

        data_by_id[question_id].outputs.append(
            ANSWER_INDEX.index(best_answer))

    for datum in data_by_id.values():
        data_by_layout_type[datum.layout.modules].append(datum)

    self.by_id = data_by_id
    self.by_layout_type = data_by_layout_type
    self.layout_types = data_by_layout_type.keys()

    logging.info("%s:", set_name.upper())
    logging.info("%s items", len(self.by_id))
    logging.info("%d answers", len(ANSWER_INDEX))
    logging.info("%d layouts", len(self.layout_types))
    logging.info("")
def __init__(self, config, set_name):
    """Load one dataset split from its line-aligned text files.

    Args:
        config: configuration object; stored on the instance unchanged.
        set_name: split name, substituted into STRING_FILE, PARSE_FILE,
            ANN_FILE and IMAGE_ID_FILE. The "val" split is never read from
            disk and yields an empty dataset.

    Sets self.data (set of datums) and three groupings of the same datums:
    self.by_layout_type (keyed by datum.layout.modules),
    self.by_string_length (keyed by len(datum.string)) and
    self.by_layout_and_length (keyed by the pair of both).
    """
    self.config = config

    data = set()
    data_by_layout_type = defaultdict(list)
    data_by_string_length = defaultdict(list)
    data_by_layout_and_length = defaultdict(list)

    if set_name == "val":
        self.data = data
        self.by_layout_type = data_by_layout_type
        self.by_string_length = data_by_string_length
        self.by_layout_and_length = data_by_layout_and_length
        return

    if set_name == "train":
        # TODO better index
        # First pass over the parses: only predicates that occur more than
        # once are passed to LAYOUT_INDEX.index() ahead of the main load.
        pred_counter = defaultdict(int)
        with open(PARSE_FILE % set_name) as parse_f:
            for parse_str in parse_f:
                parse_preds = parse_str.strip() \
                    .replace("'", "") \
                    .replace("(", "") \
                    .replace(")", "") \
                    .split()
                for pred in parse_preds:
                    pred_counter[pred] += 1
        for pred, count in pred_counter.items():
            if count <= 1:
                continue
            LAYOUT_INDEX.index(pred)

    # Does not depend on the row being read, so compute it once.
    coco_set_name = "train" if set_name == "train" else "val"

    with open(STRING_FILE % set_name) as question_f, \
            open(PARSE_FILE % set_name) as parse_f, \
            open(ANN_FILE % set_name) as ann_f, \
            open(IMAGE_ID_FILE % set_name) as image_id_f:
        # The four files are read in lockstep, one example per line; zip()
        # stops at the shortest file.
        for question, parse_str, answer, image_id in \
                zip(question_f, parse_f, ann_f, image_id_f):
            question = question.strip()
            parse_str = parse_str.strip().replace("'", "")
            answer = answer.strip()
            image_id = int(image_id.strip())

            words = ["<s>"] + question.split() + ["</s>"]
            parse = parse_tree(parse_str)

            answer = ANSWER_INDEX.index(answer)
            words = [STRING_INDEX.index(w) for w in words]
            if len(parse) == 1:
                # A one-element parse gets "object" appended before the
                # layout is built.
                parse = parse + ("object",)
            layout = parse_to_layout(parse)

            try:
                datum = CocoQADatum(
                    words, layout, image_id, answer, coco_set_name)
                datum.raw_query = parse_str
                data.add(datum)
                data_by_layout_type[datum.layout.modules].append(datum)
                data_by_string_length[len(datum.string)].append(datum)
                data_by_layout_and_length[
                    (datum.layout.modules, len(datum.string))].append(datum)
            except IOError as e:
                # Best effort: presumably the image data for this example
                # is missing (TODO confirm). Skip it but leave a trace.
                logging.debug("skipping image %s: %s", image_id, e)

    self.data = data
    self.by_layout_type = data_by_layout_type
    self.by_string_length = data_by_string_length
    self.by_layout_and_length = data_by_layout_and_length

    logging.info("%s:", set_name.upper())
    logging.info("%s items", len(self.data))
    logging.info("%s words", len(STRING_INDEX))
    logging.info("%s functions", len(LAYOUT_INDEX))
    logging.info("%s answers", len(ANSWER_INDEX))
    logging.info("%s layouts", len(self.by_layout_type.keys()))
    logging.info("")
def __init__(self, config, set_name, filter_file=None):
    """Load one dataset split from its line-aligned text files.

    Args:
        config: configuration object; stored on the instance, and its
            train_size is substituted into the file name templates.
        set_name: split name, substituted into STRING_FILE, PARSE_FILE and
            ANN_FILE together with config.train_size.
        filter_file: optional path to a file with one image id per line;
            when given, examples whose image id is not listed are skipped.

    Sets self.data (set of datums) and three groupings of the same datums:
    self.by_layout_type (keyed by datum.layout.modules),
    self.by_string_length (keyed by len(datum.string)) and
    self.by_layout_and_length (keyed by the pair of both).
    """
    self.config = config
    file_key = (config.train_size, set_name)

    items = set()
    by_layout = defaultdict(list)
    by_length = defaultdict(list)
    by_layout_and_length = defaultdict(list)

    with open(STRING_FILE % file_key) as question_f, \
            open(PARSE_FILE % file_key) as parse_f, \
            open(ANN_FILE % file_key) as ann_f:

        if filter_file is None:
            allowed_images = None
        else:
            with open(filter_file) as filt_h:
                allowed_images = {line.strip() for line in filt_h}

        # The three files are read in lockstep, one example per line.
        for raw_question, raw_parse, raw_answer in \
                zip(question_f, parse_f, ann_f):
            tokens = raw_question.strip().split()
            # The image id is taken from the second-to-last token and the
            # last four tokens are left out of the question text.
            image_id = tokens[-2]
            sentence = ["<s>"] + tokens[:-4] + ["</s>"]

            answer_text = raw_answer.strip()
            # TODO multi answer
            if "," in answer_text:
                continue
            if allowed_images is not None and image_id not in allowed_images:
                continue

            answer_id = ANSWER_INDEX.index(answer_text)
            word_ids = [STRING_INDEX.index(token) for token in sentence]
            layout = parse_to_layout(parse_tree(raw_parse.strip()))

            datum = DaquarDatum(word_ids, layout, image_id, answer_id)
            items.add(datum)

            modules = datum.layout.modules
            length = len(datum.string)
            by_layout[modules].append(datum)
            by_length[length].append(datum)
            by_layout_and_length[(modules, length)].append(datum)

    self.data = items
    self.by_layout_type = by_layout
    self.by_string_length = by_length
    self.by_layout_and_length = by_layout_and_length

    logging.info("%s:", set_name.upper())
    logging.info("%s items", len(items))
    logging.info("%s words", len(STRING_INDEX))
    logging.info("%s functions", len(LAYOUT_INDEX))
    logging.info("%s answers", len(ANSWER_INDEX))
    logging.info("%s layouts", len(by_layout))
    logging.info("")
def __init__(self, config, set_name):
    """Load one dataset split from its line-aligned text files.

    Args:
        config: configuration object; stored on the instance unchanged.
        set_name: split name, substituted into STRING_FILE, PARSE_FILE,
            ANN_FILE and IMAGE_ID_FILE. The "val" split is never read from
            disk and yields an empty dataset.

    Sets self.data (set of datums) and three groupings of the same datums:
    self.by_layout_type (keyed by datum.layout.modules),
    self.by_string_length (keyed by len(datum.string)) and
    self.by_layout_and_length (keyed by the pair of both).
    """
    self.config = config

    data = set()
    data_by_layout_type = defaultdict(list)
    data_by_string_length = defaultdict(list)
    data_by_layout_and_length = defaultdict(list)

    if set_name == "val":
        self.data = data
        self.by_layout_type = data_by_layout_type
        self.by_string_length = data_by_string_length
        self.by_layout_and_length = data_by_layout_and_length
        return

    if set_name == "train":
        # TODO better index
        # First pass over the parses: only predicates that occur more than
        # once are passed to LAYOUT_INDEX.index() ahead of the main load.
        pred_counter = defaultdict(int)
        with open(PARSE_FILE % set_name) as parse_f:
            for parse_str in parse_f:
                parse_preds = parse_str.strip() \
                    .replace("'", "") \
                    .replace("(", "") \
                    .replace(")", "") \
                    .split()
                for pred in parse_preds:
                    pred_counter[pred] += 1
        for pred, count in pred_counter.items():
            if count <= 1:
                continue
            LAYOUT_INDEX.index(pred)

    # Does not depend on the row being read, so compute it once.
    coco_set_name = "train" if set_name == "train" else "val"

    with open(STRING_FILE % set_name) as question_f, \
            open(PARSE_FILE % set_name) as parse_f, \
            open(ANN_FILE % set_name) as ann_f, \
            open(IMAGE_ID_FILE % set_name) as image_id_f:
        # The four files are read in lockstep, one example per line; zip()
        # stops at the shortest file.
        for question, parse_str, answer, image_id in \
                zip(question_f, parse_f, ann_f, image_id_f):
            question = question.strip()
            parse_str = parse_str.strip().replace("'", "")
            answer = answer.strip()
            image_id = int(image_id.strip())

            words = ["<s>"] + question.split() + ["</s>"]
            parse = parse_tree(parse_str)

            answer = ANSWER_INDEX.index(answer)
            words = [STRING_INDEX.index(w) for w in words]
            if len(parse) == 1:
                # A one-element parse gets "object" appended before the
                # layout is built.
                parse = parse + ("object",)
            layout = parse_to_layout(parse)

            try:
                datum = CocoQADatum(
                    words, layout, image_id, answer, coco_set_name)
                datum.raw_query = parse_str
                data.add(datum)
                data_by_layout_type[datum.layout.modules].append(datum)
                data_by_string_length[len(datum.string)].append(datum)
                data_by_layout_and_length[
                    (datum.layout.modules, len(datum.string))].append(datum)
            except IOError as e:
                # Best effort: presumably the image data for this example
                # is missing (TODO confirm). Skip it but leave a trace.
                logging.debug("skipping image %s: %s", image_id, e)

    self.data = data
    self.by_layout_type = data_by_layout_type
    self.by_string_length = data_by_string_length
    self.by_layout_and_length = data_by_layout_and_length

    logging.info("%s:", set_name.upper())
    logging.info("%s items", len(self.data))
    logging.info("%s words", len(STRING_INDEX))
    logging.info("%s functions", len(LAYOUT_INDEX))
    logging.info("%s answers", len(ANSWER_INDEX))
    logging.info("%s layouts", len(self.by_layout_type.keys()))
    logging.info("")