import json
import os
import random

import numpy as np
from sklearn.model_selection import train_test_split

# Background, Image, ColorPool, ShapePool, QuestionsGroup, pool_iteration and
# show_progress are project-local helpers assumed to be imported elsewhere in
# this module.


def generate_test(output_dir, image_count, min_shapes=7, max_shapes=15):
    """Generate `image_count` multi-shape test images and their questions."""
    data = []
    for i in range(image_count):
        # Two random background colors on a 500x500 canvas, arranged randomly.
        bg = Background(ColorPool.default().random(count=2), (500, 500), "random")
        image = Image(bg, i)
        # Add a random number of shapes whose colors avoid the background colors.
        for _ in range(np.random.randint(min_shapes, max_shapes + 1)):
            image.add_shape(ShapePool.default().random().new(
                color=ColorPool.default().random(bg.colors)))
        image.save(os.path.join(output_dir, "images_test"))
        questions = QuestionsGroup().generate_all(image)
        show_progress(i, image_count, prefix="Test Generation")
        data.extend(questions)
    print()
    with open(os.path.join(output_dir, "questions_test.json"), "w") as file:
        json.dump(data, file)
        print("Test data file:", file.name)
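# A minimal usage sketch for generate_test (the "data" output directory and
# the image count are hypothetical, not taken from the project). This writes
# the rendered images under data/images_test/ and the question list to
# data/questions_test.json:
#
#     generate_test("data", image_count=100)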
def generate_train_validation(output_dir, image_repeats=1,
                              balance_approach="categorical",
                              validation_split_ratio=0.01):
    """Generate train/validation images and questions, balance, and split them."""
    # Generate data: iterate over every (background color, shape, shape color)
    # combination, repeated `image_repeats` times.
    print()
    all_data = []
    all_space = pool_iteration("colors", "shapes", "colors", image_repeats)
    total_iterations = ColorPool.default().colors_len() * \
        ShapePool.default().shapes_len() * \
        ColorPool.default().colors_len() * \
        image_repeats
    for i, (bg_color, shape, shape_color, image_index) in enumerate(all_space):
        # Skip shapes that would be invisible against the background.
        if bg_color == shape_color:
            continue

        # Generate an "invalid" image (background only, no shape) and its questions.
        bg = Background(bg_color, (50, 50))
        image = Image(bg)
        image.save(os.path.join(output_dir, "images"))
        questions = QuestionsGroup().generate_all(image)
        all_data.extend(questions)

        # Generate a valid image (background plus one shape) and its questions.
        bg = Background(bg_color, (50, 50))
        image = Image(bg, image_index + 1) if image_repeats > 1 else Image(bg)
        image.add_shape(shape.new(shape_color))
        image.save(os.path.join(output_dir, "images"))
        questions = QuestionsGroup().generate_all(image)
        all_data.extend(questions)

        show_progress(i, total_iterations, prefix="Train Generation")
    print()
    print("Total generated questions:", len(all_data))

    # Balance data: bucket the questions by their answer.
    random.shuffle(all_data)
    data_table = {}
    for d in all_data:
        key = d["answer"][0]
        data_table.setdefault(key, []).append(d)
    answer_count = {k: len(v) for k, v in data_table.items()}
    print("Number of each answer:")
    for answer, count in answer_count.items():
        print(answer, count)

    balanced_data = []
    if balance_approach == "categorical":
        # Balance within each answer category (colors, shapes) separately:
        # every answer in a group is truncated to the group's smallest count.
        data_groups = {
            "colors": [c.name for c in pool_iteration("colors")] + ["unknown_color"],
            "shapes": [s.name for s in pool_iteration("shapes")] + ["unknown_shape"],
        }
        for gname, glist in data_groups.items():
            print(gname, glist)
        print("Group minimums:")
        for gname, glist in data_groups.items():
            group_min = min(answer_count.get(a, 0) for a in glist)
            print(gname, group_min)
            if group_min == 0:
                continue
            for a in glist:
                balanced_data.extend(data_table[a][:group_min])
    elif balance_approach == "blindfold":
        # Balance across all answers at once: truncate every answer to the
        # globally smallest count.
        answers_min_count = min(answer_count.values())
        for answer, qlist in data_table.items():
            balanced_data.extend(qlist[:answers_min_count])
    elif balance_approach == "none":
        balanced_data = all_data
    else:
        raise ValueError(
            f"{balance_approach} is not a valid data balance approach")
    print("Total number of questions after balancing:", len(balanced_data))

    data_table = {}
    for d in balanced_data:
        key = d["answer"][0]
        data_table.setdefault(key, []).append(d)
    answer_count = {k: len(v) for k, v in data_table.items()}
    print("Number of each answer after balancing:")
    for answer, count in answer_count.items():
        print(answer, count)

    # Split into train and validation sets.
    data_train, data_validation = train_test_split(
        balanced_data, test_size=validation_split_ratio)
    print("Total number of train data:", len(data_train))
    print("Total number of validation data:", len(data_validation))
    with open(os.path.join(output_dir, "questions_train.json"), "w") as file:
        json.dump(data_train, file)
        print("Train data file:", file.name)
    with open(os.path.join(output_dir, "questions_validation.json"), "w") as file:
        json.dump(data_validation, file)
        print("Validation data file:", file.name)
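# A minimal driver sketch, assuming this module is run as a script. The
# "data" output directory, the image count, and the argument values below are
# hypothetical, not taken from the project.
if __name__ == "__main__":
    out_dir = "data"  # hypothetical output root
    os.makedirs(out_dir, exist_ok=True)
    # Train/validation set: one pass over the color/shape space, balanced per
    # answer category, with 1% held out for validation.
    generate_train_validation(out_dir, image_repeats=1,
                              balance_approach="categorical",
                              validation_split_ratio=0.01)
    # Independent test set of larger, multi-shape images.
    generate_test(out_dir, image_count=100)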