def create_subset_data(face_encoding_dir, src_dataset_dir, dst_dataset_dir):
    """Build a subset dataset by copying, per mid, the images whose face
    encodings have the highest similarity scores.

    For each mid directory under face_encoding_dir, similarity scores are
    computed for all stored encodings; the top ``ceil(best_score * 100)``
    images are queued for copying from src_dataset_dir to dst_dataset_dir.
    """
    copy_pairs = []
    mids = utils.get_dir_names(face_encoding_dir)
    for idx, mid in enumerate(mids):
        encoding_map = utils.load_json(os.path.join(face_encoding_dir, mid))

        file_names = list(encoding_map.keys())
        face_encodings = [np.array(enc) for enc in encoding_map.values()]

        # Pair each file with its similarity score, best first.
        scored = list(zip(file_names, calculate_similarity(face_encodings)))
        scored.sort(key=lambda pair: pair[1], reverse=True)
        print("{}/{} Calculate similarity of mid {} done".format(
            idx + 1, len(mids), mid))

        # Remain number images corresponding with highest similarity
        if scored:
            num_remain_images = math.ceil(scored[0][1] * 100)
            for fname, _ in scored[:num_remain_images]:
                copy_pairs.append(
                    (os.path.join(src_dataset_dir, mid, fname),
                     os.path.join(dst_dataset_dir, mid, fname)))
        else:
            # No encodings for this mid: still create an empty destination dir.
            utils.make_dirs(os.path.join(dst_dataset_dir, mid))

    num_success = utils.copy_files(copy_pairs)
    print("Create subset data (size = {}) from {} to {} done".format(
        num_success, src_dataset_dir, dst_dataset_dir))
def save_face_encoding(dataset_dir="../Temp/Dataset/Original",
                       save_dir="../Temp/Dataset/Process"):
    """Compute face encodings for every subdirectory of dataset_dir and
    save one JSON file per directory under ``save_dir/face_encodings``."""
    start_time = time.time()
    save_dir = os.path.join(save_dir, "face_encodings")
    total_files = 0

    dir_names = utils.get_dir_names(parent_dir=dataset_dir)
    total_dirs = len(dir_names)
    for idx, dir_name in enumerate(dir_names):
        encodings = _get_face_encodings(os.path.join(dataset_dir, dir_name))
        # _get_face_encodings yields (file_name, encoding) pairs.
        fencoding_map = dict(encodings)
        total_files += len(fencoding_map)

        utils.save_json(fencoding_map, os.path.join(save_dir, dir_name))
        print("Calculate and Save {}/{} face encoding dir done".format(
            idx + 1, total_dirs))

    exec_time = time.time() - start_time
    print("\nCalculate face encodings of {} dirs and {} files in dir {} done".
          format(total_dirs, total_files, dataset_dir))
    print("Save face encoding to dir {} done".format(save_dir))
    print("Time : {:.2f} seconds".format(exec_time))
def copy_subset_args():
    """CLI entry point: copy up to --max_classes classes that contain at
    least --min_imgs_per_class images from --src_dataset_dir into
    --dst_dataset_dir.

    Classes are visited in the order returned by utils.get_dir_names;
    copying stops as soon as max_classes qualifying classes were copied.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src_dataset_dir", required=True)
    ap.add_argument("--dst_dataset_dir", required=True)
    # Let argparse validate and convert numeric flags instead of passing
    # string defaults and casting by hand afterwards (same CLI interface).
    ap.add_argument("--min_imgs_per_class", "-mipc", type=int, default=50)
    ap.add_argument("--max_classes", type=int, default=500)
    args = vars(ap.parse_args())

    src_dataset_dir = args["src_dataset_dir"]
    dst_dataset_dir = args["dst_dataset_dir"]
    min_imgs_per_class = args["min_imgs_per_class"]
    max_classes = args["max_classes"]

    num_classes = 0
    for class_name in utils.get_dir_names(src_dataset_dir):
        file_names = utils.get_file_names(
            os.path.join(src_dataset_dir, class_name))
        # Only copy classes with enough images.
        if len(file_names) >= min_imgs_per_class:
            num_classes += 1
            src_dst_paths = [
                (os.path.join(src_dataset_dir, class_name, src_name),
                 os.path.join(dst_dataset_dir, class_name, src_name))
                for src_name in file_names
            ]
            utils.copy_files(src_dst_paths)
            print("\nCopy {}/{} classes done".format(num_classes, max_classes))
            if num_classes >= max_classes:
                break
def generate_batch(dataset_dir, batch_size=64, image_size=160):
    """Endless generator of siamese training batches.

    Yields ``([x_batch1, x_batch2], y_batch)`` where each iteration of the
    inner loop contributes one same-class pair (label 1) and one
    different-class pair (label 0), so a batch holds batch_size pairs.
    Images are resized/augmented with a fixed-seed imgaug pipeline.

    Fixes vs. original: removed the dead local ``max_iter`` and replaced
    ``int(batch_size / 2)`` with integer floor division.
    """
    seq = iaa.Sequential([
        iaa.Scale({"height": image_size, "width": image_size}),
        iaa.Fliplr(0.5, random_state=7),
        iaa.Affine(scale={"x": (0.9, 1.1), "y": (0.9, 1.1)},
                   rotate=(-15, 15))
    ])
    random.seed(7)

    mid_names = utils.get_dir_names(dataset_dir)
    random.shuffle(mid_names)
    # Per-mid cursor into its (shuffled) file list, and the file lists.
    map_mid_idx = {mid: 0 for mid in mid_names}
    map_mid_fpaths = {
        mid: utils.get_file_paths(os.path.join(dataset_dir, mid))
        for mid in mid_names
    }

    mid_idx = 0
    while True:
        x_batch1, x_batch2, y_batch = [], [], []
        # Two pairs are appended per iteration -> batch_size // 2 iterations.
        for _ in range(batch_size // 2):
            mid_name = mid_names[mid_idx]

            # Generate same class pair
            fpaths = map_mid_fpaths.get(mid_name)
            path_idx = map_mid_idx.get(mid_name)
            img1 = cv2.imread(fpaths[path_idx])
            img2 = cv2.imread(fpaths[(path_idx + 1) % len(fpaths)])
            x_batch1.append(seq.augment_image(img1))
            x_batch2.append(seq.augment_image(img2))
            y_batch.append(1)

            # Advance this mid's cursor; reshuffle its files on wrap-around.
            path_idx = (path_idx + 1) % len(fpaths)
            map_mid_idx.update({mid_name: path_idx})
            if path_idx == 0:
                random.shuffle(fpaths)

            # Generate different class pair (same anchor img1, random image
            # from the next mid in the shuffled order).
            next_mid_name = mid_names[(mid_idx + 1) % len(mid_names)]
            fpaths = map_mid_fpaths.get(next_mid_name)
            img2 = cv2.imread(random.choice(fpaths))
            x_batch1.append(seq.augment_image(img1))
            x_batch2.append(seq.augment_image(img2))
            y_batch.append(0)

            # Advance the class cursor; reshuffle classes on wrap-around.
            mid_idx = (mid_idx + 1) % len(mid_names)
            if mid_idx == 0:
                random.shuffle(mid_names)

        yield [np.array(x_batch1), np.array(x_batch2)], np.array(y_batch)
def _init_data(self):
    """Load the mid->name mapping, build mid<->class-index lookup tables
    from the training directory names, and (in train mode) save a class
    distribution EDA report under the experiment directory."""
    self.mid_name_map = project_utils.load_mid_name_map(self.mid_name_path)

    mids_train = utils.get_dir_names(self.training_data_dir)
    # Forward and reverse lookups between mid string and class index.
    self.mid_class_map = {mid: idx for idx, mid in enumerate(mids_train)}
    self.class_mid_map = {idx: mid for idx, mid in enumerate(mids_train)}
    self.num_classes = len(mids_train)

    if self.mode == "train":
        eda_save_dir = os.path.join(self.experiment_dir, "EDA_Result")
        calculate_class_distribution(self.training_data_dir,
                                     save_dir=eda_save_dir)
def split_dataset(src_dataset_dir, dst_dataset_dir, test_size=0.1,
                  valid_size=0.0):
    """Split every class directory of src_dataset_dir into Train/Valid/Test
    subsets under dst_dataset_dir and copy the files accordingly.

    Per class, ``ceil(test_size * n)`` files go to Test and
    ``ceil(valid_size * n)`` to Valid (after an in-place shuffle); the rest
    go to Train.

    Bug fix: the original used ``dir_fnames[-num_test:]`` for the test
    slice, which returns the ENTIRE list when ``num_test == 0`` (e.g.
    ``test_size=0.0``), duplicating every file into the test split. Slicing
    from ``num_train + num_valid`` yields the same files when num_test > 0
    and an empty list when num_test == 0.
    """
    start_time = time.time()
    train, valid, test = [], [], []
    for dir_name in utils.get_dir_names(src_dataset_dir):
        fnames = utils.get_file_names(os.path.join(src_dataset_dir, dir_name))
        num_fnames = len(fnames)
        num_test = int(math.ceil(test_size * num_fnames))
        num_valid = int(math.ceil(valid_size * num_fnames))
        num_train = num_fnames - num_test - num_valid
        random.shuffle(fnames)

        dir_fnames = [(dir_name, fname) for fname in fnames]
        train.extend(dir_fnames[:num_train])
        valid.extend(dir_fnames[num_train:num_train + num_valid])
        test.extend(dir_fnames[num_train + num_valid:])

    train_dir = os.path.join(dst_dataset_dir, "Train")
    valid_dir = os.path.join(dst_dataset_dir, "Valid")
    test_dir = os.path.join(dst_dataset_dir, "Test")

    # Save new split dataset
    lst = [(train_dir, train), (valid_dir, valid), (test_dir, test)]
    src_dst_paths = []
    for dst_parent_dir, dir_fnames in lst:
        for dir_name, fname in dir_fnames:
            src_path = os.path.join(src_dataset_dir, dir_name, fname)
            dst_path = os.path.join(dst_parent_dir, dir_name, fname)
            src_dst_paths.append((src_path, dst_path))
    utils.copy_files(src_dst_paths)

    exec_time = time.time() - start_time
    print("\nSplit dataset from {} (size = {}) to :".format(
        src_dataset_dir, len(src_dst_paths)))
    print("---- {} (size = {})".format(train_dir, len(train)))
    print("---- {} (size = {})".format(valid_dir, len(valid)))
    print("---- {} (size = {})".format(test_dir, len(test)))
    print("Time : {:.2f} seconds".format(exec_time))
def _load_face_encodings(self):
    """Build self.X_train / self.y_train from cached face encodings,
    computing (and persisting) any encodings missing from the cache.

    Side effects:
      - populates self.face_encoding_map[mid][file_name]
      - populates self.idx_fname_map: training-row index -> (mid, file_name)
      - re-saves a mid's encoding JSON only when new encodings were computed
    """
    start_time = time.time()
    X_train, y_train = [], []
    idx_fname_map = {}
    mids_train = utils.get_dir_names(self.training_data_dir)
    for mid in mids_train:
        self.face_encoding_map.update({mid: {}})
        mid_dir = os.path.join(self.training_data_dir, mid)
        file_names = utils.get_file_names(mid_dir)
        # Load the cached encodings for this one mid.
        # NOTE(review): if no cache entry exists for `mid`, .get(mid)
        # returns None and the .get(file_name) below would raise — confirm
        # load_face_encoding always returns a dict entry per requested mid.
        fencoding_map_of_mid = load_face_encoding(self.face_encoding_dir,
                                                  file_names=[mid
                                                              ]).get(mid)
        # print("face_encoding_map of {} : {}".format(mid, list(fencoding_map_of_mid.keys())))
        num_calculated_files = 0
        for file_name in file_names:
            fencoding = fencoding_map_of_mid.get(file_name)
            if fencoding is None:
                # Calculate face encoding of this file name
                fencoding = get_face_encodings(
                    image_path=os.path.join(mid_dir, file_name))
                # An empty result means no face was found; only count and
                # cache non-empty encodings.
                if len(fencoding) > 0:
                    num_calculated_files += 1
                    fencoding_map_of_mid.update({file_name: fencoding})
            if len(fencoding) > 0:
                self.face_encoding_map[mid].update({file_name: fencoding})
                # Map the row index this sample will occupy in X_train.
                idx_fname_map.update({len(X_train): (mid, file_name)})
                X_train.append(fencoding)
                y_train.append(self.mid_class_map.get(mid))
        # Save face encoding if there is any encoding just calculated
        if num_calculated_files > 0:
            utils.save_json(fencoding_map_of_mid,
                            os.path.join(self.face_encoding_dir, mid))
    self.X_train, self.y_train = np.array(X_train), np.array(y_train)
    self.idx_fname_map = idx_fname_map
    exec_time = time.time() - start_time
    print("{}:: Load face encoding done. Time : {:.2f} seconds".format(
        self.class_name, exec_time))
def train(self):
    """Train a classification (or siamese) model and save weights/history.

    Flow: build data generators -> pick optimizer -> build or load the
    model -> (optionally) wrap as siamese -> fit with a val-accuracy
    checkpoint -> save final model and a History.csv of metrics.
    """
    start_time = time.time()
    # Setup generator
    if self.is_siamese:
        # Custom pair generator; num_classes derived from directory count.
        train_generator = generate_batch(dataset_dir=self.train_dir,
                                         batch_size=self.batch_size,
                                         image_size=self.image_size)
        valid_generator = generate_batch(dataset_dir=self.valid_dir,
                                         batch_size=self.batch_size,
                                         image_size=self.image_size)
        self.num_classes = len(utils.get_dir_names(self.train_dir))
    else:
        # Standard Keras directory iterators with light augmentation on train.
        train_datagen = ImageDataGenerator(
            rescale=1. / 255,
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True,
        )
        valid_datagen = ImageDataGenerator(rescale=1. / 255)
        train_generator = train_datagen.flow_from_directory(
            directory=self.train_dir,
            target_size=(self.image_size, self.image_size),
            batch_size=self.batch_size,
        )
        valid_generator = valid_datagen.flow_from_directory(
            directory=self.valid_dir,
            target_size=(self.image_size, self.image_size),
            batch_size=self.batch_size,
        )
        self.num_classes = len(train_generator.class_indices)

    # Optimizer selection; any unrecognized string silently falls back to Adam.
    optimizer = Adam
    if self.optimizer == "Adam":
        optimizer = Adam
    elif self.optimizer == "RMSProp":
        optimizer = RMSprop

    model = None
    # Check training from scratch or continue training
    if self.model_path is not None:
        model = load_model(self.model_path)
    else:
        if self.model_name == "VGG16":
            model_base = VGG16(include_top=False,
                               input_shape=self.input_shape)
        elif self.model_name == "ResNet50":
            model_base = ResNet50(include_top=False,
                                  input_shape=self.input_shape)
        elif self.model_name == "DenseNet121":
            model_base = DenseNet121(include_top=False,
                                     input_shape=self.input_shape)
        elif self.model_name == "InceptionV3":
            model_base = InceptionV3(include_top=False,
                                     input_shape=self.input_shape)
        elif self.model_name == "InceptionResNetV2":
            model_base = InceptionResNetV2(include_top=False,
                                           input_shape=self.input_shape)
        elif self.model_name == "Xception":
            model_base = Xception(include_top=False,
                                  input_shape=self.input_shape)
        elif self.model_name == "Scratch":
            # Hand-built VGG-style conv stack; every layer stays trainable
            # because num_trainable_layer is set to the full layer count.
            model_base = Sequential()
            model_base.add(
                Conv2D(32, kernel_size=(3, 3), activation="relu",
                       input_shape=self.input_shape))
            model_base.add(
                Conv2D(32, kernel_size=(3, 3), activation="relu"))
            model_base.add(MaxPool2D())
            model_base.add(
                Conv2D(64, kernel_size=(3, 3), activation="relu"))
            model_base.add(
                Conv2D(64, kernel_size=(3, 3), activation="relu"))
            model_base.add(MaxPool2D())
            model_base.add(
                Conv2D(128, kernel_size=(3, 3), activation="relu"))
            model_base.add(
                Conv2D(128, kernel_size=(3, 3), activation="relu"))
            model_base.add(
                Conv2D(128, kernel_size=(3, 3), activation="relu"))
            model_base.add(MaxPool2D())
            model_base.add(
                Conv2D(256, kernel_size=(3, 3), activation="relu"))
            model_base.add(
                Conv2D(256, kernel_size=(3, 3), activation="relu"))
            model_base.add(
                Conv2D(256, kernel_size=(3, 3), activation="relu"))
            model_base.add(MaxPool2D())
            self.num_trainable_layer = len(model_base.layers)
        else:
            print("Model name {} is not valid ".format(self.model_name))
            return 0
        # Freeze low layer
        for layer in model_base.layers[:-self.num_trainable_layer]:
            layer.trainable = False
        # Show trainable status of each layers
        print("\nAll layers of {} ".format(self.model_name))
        for layer in model_base.layers:
            print("Layer : {} - Trainable : {}".format(
                layer, layer.trainable))
        model = Sequential()
        model.add(model_base)
        model.add(Flatten())
        # model.add(Dense(50, activation="relu"))
        # model.add(Dropout(0.25))
        model.add(Dense(self.num_classes, activation="softmax"))
        # Compile model
        model.compile(loss="categorical_crossentropy",
                      metrics=["acc"],
                      optimizer=optimizer(lr=self.lr))
        if self.is_siamese:
            # Wrap the classifier as a siamese network and recompile with
            # contrastive loss and the pair-accuracy metric.
            model = get_siamese_model(model)
            model.compile(loss=contrastive_loss,
                          metrics=[accuracy],
                          optimizer=optimizer(lr=self.lr))
    print("\nFinal model summary")
    model.summary()
    # classes = [_ for _ in range(self.num_classes)]
    # for c in train_generator.class_indices:
    #     classes[train_generator.class_indices[c]] = c
    # # model.classes = classes
    # Define callbacks
    save_model_dir = os.path.join(self.save_dir,
                                  "Model_{}".format(self.model_name))
    utils.make_dirs(save_model_dir)
    # loss_path = os.path.join(save_model_dir, "epochs_{epoch:02d}-val_loss_{val_loss:.2f}.h5")
    # loss_checkpoint = ModelCheckpoint(
    #     filepath=loss_path,
    #     monitor="val_loss",
    #     verbose=1,
    #     save_best_only=True
    # )
    # NOTE(review): the model is compiled with metrics=["acc"], which older
    # Keras reports as "val_acc", yet the checkpoint monitors/format-strings
    # "val_accuracy" while the history read below uses "val_acc" — these
    # metric names likely disagree depending on the Keras version; verify.
    acc_path = os.path.join(
        save_model_dir, "epochs_{epoch:02d}-val_acc_{val_accuracy:.2f}.h5")
    acc_checkpoint = ModelCheckpoint(filepath=acc_path,
                                     monitor="val_accuracy",
                                     verbose=1,
                                     save_best_only=True)
    callbacks = [acc_checkpoint]
    # Train model
    print("Start train model from {} ...".format("{} pretrained".format(
        self.model_name) if self.model_path is None else self.model_path))
    if self.is_siamese:
        # Siamese generators are endless, so steps are derived from the
        # class count rather than a sample count.
        history = model.fit_generator(
            generator=train_generator,
            steps_per_epoch=self.num_classes / self.batch_size,
            epochs=self.num_epochs,
            validation_data=valid_generator,
            validation_steps=self.num_classes / self.batch_size,
            callbacks=callbacks)
    else:
        # NOTE(review): validation_steps divides by train_generator's batch
        # size, not valid_generator's — same value here since both use
        # self.batch_size, but confirm that is intentional.
        history = model.fit_generator(
            generator=train_generator,
            steps_per_epoch=train_generator.samples /
            train_generator.batch_size,
            epochs=self.num_epochs,
            validation_data=valid_generator,
            validation_steps=valid_generator.samples /
            train_generator.batch_size,
            callbacks=callbacks)
    # Save model
    save_path = os.path.join(save_model_dir, "final_model.h5")
    model.save(save_path)
    # Save history
    acc, val_acc = history.history["acc"], history.history["val_acc"]
    loss, val_loss = history.history["loss"], history.history["val_loss"]
    train_stats = dict(Loss=loss,
                       Valid_Loss=val_loss,
                       Accuracy=acc,
                       Valid_Accuracy=val_acc)
    df = pd.DataFrame(train_stats)
    save_path = os.path.join(self.save_dir, "History.csv")
    utils.save_csv(df, save_path)
    exec_time = time.time() - start_time
    print("\nTrain model {} done. Time : {:.2f} seconds".format(
        "{} pretrained".format(self.model_name)
        if self.model_path is None else self.model_path, exec_time))