def dataset_inference_unlabeled(dataset_path, data_transforms, model_path, save_path,
                                batch_size=64, force=False, seed=None, verbose=True):
    '''
    Perform inference on an unlabeled dataset, using a csv Index file as reference.

    dataset_path: string or Path
        Path to the csv index of the unlabeled dataset.
    data_transforms: torchvision-style transform(s) applied to each image.
    model_path: string or Path
        Path to the saved model used for inference.
    save_path: string or Path
        Pickle file where the resulting outputs DataFrame is cached.
    batch_size: int
        Inference batch size.
    force: Boolean
        If force is False, search for an existing output file and use it, if it exists.
        If force is True or output file doesn't exist, compute dataset output and
        save to file.
    seed: currently unused; kept for interface compatibility.  # TODO confirm intent
    verbose: Boolean
        If True, print progress information.

    Returns the outputs DataFrame (cached or freshly computed).
    '''
    # Cache hit: reuse a previously computed, non-empty outputs file.
    if os.path.isfile(save_path) and not force:
        outputDf = utils.load_pickle(save_path)
        if len(outputDf) > 0:
            return outputDf

    unlabelIndex = IndexManager(dataset_path)

    # Drop duplicated files
    unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

    # Drop missing or corrupt images
    unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                               utils.check_empty_file, "FramePath")

    imagePathList = unlabelIndex.index["FramePath"].values
    datasetLen = len(imagePathList)

    if verbose:
        print("\nUnlabeled set inference")
        print("\nDataset information: ")
        print("\t", datasetLen, "images.")

    # Label list for an unlabeled dataset (bit of a hack? is there a better way?)
    labelList = np.zeros(datasetLen)

    outputDf = _model_inference(imagePathList, data_transforms, labelList,
                                model_path, batch_size)

    ## Save output to pickle file
    if verbose:
        print("\nSaving outputs file to ", save_path)
    outputDf.to_pickle(save_path)

    # Bug fix: previously the fresh-compute path fell off the end and returned
    # None; the cached path returned a DataFrame. Return the outputs in both cases.
    return outputDf
# TODO: Encapsulate sampledImages processing in function # Add FrameHash column if "imagem" in sampledIndex.columns: fileList = sampledIndex["imagem"].values elif "FrameName" in sampledIndex.columns: fileList = sampledIndex["FrameName"].values else: raise KeyError("DataFrame doesn't have a known image path column.") sampledIndex["FrameHash"] = utils.compute_file_hash_list(fileList, folder=dirs.febe_image_dataset) # Get missing information from original Unlabeled index sampledIndex = dutils.fill_index_information(originalUnlabeledIndex, sampledIndex, "FrameHash", [ 'rede1', 'rede2', 'rede3']) sampledIndex = dutils.remove_duplicates(sampledIndex, "FrameHash") autoIndex = dutils.remove_duplicates(autoIndex, "FrameHash") # TODO: Use merge_indexes in merge_manual_auto_sets mergedIndex = dutils.merge_manual_auto_sets(sampledIndex, autoIndex) print(mergedIndex.shape) mergedIndex.to_csv(mergedIndexPath, index=False) ## Create unlabeled set for next iteration # TODO: Encapsulate this section in function print("\nCreate new unlabeled set.") mergedPathList = [get_iter_folder(x) / \ "final_annotated_images_iteration_{}.csv".format(x) for x in range(1, iteration+1)] mergedIndexList = [pd.read_csv(x) for x in mergedPathList] originalUnlabeledIndex = pd.read_csv(originalUnlabeledIndexPath)
# --- Normalize the sampled-images index (hashes, paths) and merge the current
# --- iteration's manual labels with those of previous iterations.
indexSampled = IndexManager(sampledIndexPath)

# Legacy column name: copy "imagem" into the expected "FrameName" column.
if "imagem" in indexSampled.index.columns:
    indexSampled.index["FrameName"] = indexSampled.index["imagem"].copy()
indexSampled.index["FramePath"] = indexSampled.index["FrameName"].map(_add_folder_path)

# eTime: elapsed time of the hashing pass (as returned by compute_frame_hashes).
eTime = indexSampled.compute_frame_hashes(reference_column="FramePath", verbose=True)

indexSampled.write_index(dest_path=manualIndexPath, make_backup=False, prompt=False)

## Merge manual annotated labels from current and previous iterations
# Only applies from the second iteration onward (no previous labels before that).
if iteration > 1:
    oldLabels = pd.read_csv(prevManualIndexPath)
    newLabels = pd.read_csv(manualIndexPath)

    # Remove duplicates
    oldLabels = dutils.remove_duplicates(oldLabels, "FrameHash")
    newLabels = dutils.remove_duplicates(newLabels, "FrameHash")

    # Get additional information for newLabels from main unlabeled index
    # TODO: Don't do this again when merging auto and manual annotated indexes
    originalUnlabeledIndex = pd.read_csv(originalUnlabeledIndexPath)
    originalUnlabeledIndex = dutils.remove_duplicates(originalUnlabeledIndex, "FrameHash")

    newLabels = dutils.fill_index_information(originalUnlabeledIndex, newLabels,
                                              "FrameHash",
                                              ['rede1', 'rede2', 'rede3'])
    oldLabels = dutils.fill_index_information(originalUnlabeledIndex, oldLabels,
                                              "FrameHash",
                                              ['rede1', 'rede2', 'rede3'])

    # Stack new and old labels; overwrite the manual index file with the union.
    mergedIndex = pd.concat([newLabels, oldLabels], axis=0, sort=False)
    mergedIndex.to_csv(manualIndexPath, index=False)
# NOTE(review): the first statement below is a continuation — its opening
# (something like `<resultsFolder> = Path(`) lies before this chunk. Verify
# against the surrounding file before moving this block.
dirs.images) / "{}_results_samples".format(datasetName)

# --- Path setup for one annotation-loop dataset plus sanity-cleaned reference
# --- and previous-level indexes.
loopFolder = Path(dirs.iter_folder) / datasetName
referenceIndexPath = loopFolder / "iteration_0/reference_images.csv"
previousLevelIndexPath = loopFolder / "iteration_0/unlabeled_images_iteration_0.csv"
autoIndexFullPath = loopFolder / "final_automatic_images_{}.csv".format(datasetName)
manualIndexFullPath = loopFolder / "final_manual_images_{}.csv".format(datasetName)
annotatedIndexFullPath = loopFolder / "final_annotated_images_{}.csv".format(datasetName)
reportPath = loopFolder / "annotation_report.txt"
binaryDatasetPath = Path(dirs.iter_folder) / "dataset_rede_{}_eval_setup.csv".format(rede)

referenceIndex = pd.read_csv(referenceIndexPath, low_memory=False)
referenceIndex = dutils.remove_duplicates(referenceIndex, "FrameHash")

previousLevelIndex = pd.read_csv(previousLevelIndexPath, low_memory=False)
previousLevelIndex = dutils.remove_duplicates(previousLevelIndex, "FrameHash")

# Get list of all iteration folders
folderList = utils.make_path(glob(str(loopFolder) + "/iteration*"))
# Iterate over a copy so removing non-directories doesn't skip elements.
tempList = copy(folderList)
for path in tempList:
    if not (path.is_dir()):
        folderList.remove(path)

# Net 3 is multiclass: also report the target class being annotated.
if rede == 3:
    print("\nNet {}, target class {}.".format(rede, target_class))
else:
    print("\nNet {}.".format(rede))
# --- Inference setup for one iteration: resolve model/output paths and clean
# --- the unlabeled index before building the image list.
rede = 1
batchSize = 64
unlabelIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration, iteration)
savedModelsFolder = Path(dirs.saved_models) / \
    "full_dataset_rede_{}/iteration_{}".format(rede, iteration)
modelPath = savedModelsFolder / \
    "full_dataset_no_finetune_{}_epochs_rede_{}_iteration_{}.pt".format(epochs, rede, iteration)
savePath = savedModelsFolder / \
    "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)

unlabelIndex = IndexManager(unlabelIndexPath)

# Drop duplicated files
unlabelIndex.index = dutils.remove_duplicates(unlabelIndex.index, "FrameHash")

# Drop missing or corrupt images
unlabelIndex.index = dutils.check_df_files(unlabelIndex.index,
                                           utils.check_empty_file, "FramePath")

imagePathList = unlabelIndex.index["FramePath"].values
datasetLen = len(imagePathList)

print("\nDataset information: ")
print("\t", datasetLen, "images.")

# ImageNet statistics
# Normalization constants, presumably fed to the image transforms defined later.
mean = commons.IMAGENET_MEAN
std = commons.IMAGENET_STD
# NOTE(review): the two indented lines below are the tail of a function whose
# `def` lies before this chunk (apparently the `merge_labeled_sets` called
# further down — confirm against the full file).
    mergedIndex = pd.concat([manualIndex, autoIndex], axis=0, sort=False)
    return mergedIndex.copy()


# --- Script: load iteration-1 manual and automatic indexes, enrich the manual
# --- one from the main unlabeled index, and merge the two labeled sets.
unlabeledPath = Path(dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"
manualPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/sampled_images_iteration_1.csv"
autoPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/automatic_labeled_images_iteration_1.csv"

manualIndex = pd.read_csv(manualPath)
autoIndex = pd.read_csv(autoPath)

# De-duplicate on the frame hash so it can serve as a unique merge key.
manualIndex = dutils.remove_duplicates(manualIndex, "FrameHash")
autoIndex = dutils.remove_duplicates(autoIndex, "FrameHash")

# Get additional information for manualIndex from main unlabeled index
# TODO: Do this as the second iteration step
unlabeledIndex = pd.read_csv(unlabeledPath)
unlabeledIndex = dutils.remove_duplicates(unlabeledIndex, "FrameHash")
manualIndex = fill_index_information(unlabeledIndex, manualIndex, "FrameHash",
                                     ["rede1", "rede2", "rede3", "set"])
print(manualIndex.head())
print(manualIndex.shape)

mergedIndex = merge_labeled_sets(manualIndex, autoIndex)
print(mergedIndex.head())
print(mergedIndex.shape)
# NOTE(review): the first statement below is a continuation — its opening
# (presumably `unlabelIndexPath = Path(`) lies before this chunk.
dirs.index) / "unlabeled_index_2019-8-18_19-32-37_HASHES.csv"

# --- Build the next iteration's unlabeled set as the complement of the
# --- already-annotated images within the current unlabeled index.
sampledIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_1/final_annotated_images_iteration_1.csv"
newUnlabelIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_2/unlabeled_images_iteration_2.csv"

# Load model outputs and unlabeled images index
# indexUnlabel = IndexManager(unlabelIndexPath)
# indexSampled = IndexManager(sampledIndexPath)
indexUnlabel = pd.read_csv(unlabelIndexPath)
indexSampled = pd.read_csv(sampledIndexPath)

print(indexUnlabel.index.shape)
indexUnlabel = dutils.remove_duplicates(indexUnlabel, "FrameHash")

# Index both frames on FrameHash (kept as a column too) for the set complement.
indexUnlabel.set_index("FrameHash", drop=False, inplace=True)
indexSampled.set_index("FrameHash", drop=False, inplace=True)

# Sanity check: should both print 0 after de-duplication.
print(indexUnlabel.index.duplicated().sum())
print(indexSampled.index.duplicated().sum())

# newIndex = unlabeled entries whose hash is NOT in the sampled (annotated) set.
newIndex = dutils.index_complement(indexUnlabel, indexSampled, "FrameHash")
print(newIndex.shape)

# indexUnlabel.write_index(newUnlabelIndexPath, prompt=False, backup=False)
dirs.create_folder(newUnlabelIndexPath.parent)
newIndex.to_csv(newUnlabelIndexPath, index=False)
# --- Compile all positive indexes into one, resolving frames that received
# --- multiple conflicting 'rede3' labels via a priority table.
print("\nConcatenating positive indexes...")
compiledPositivesIndex = pd.concat(positivesList.values())


def _translate_dup_label(label_list):
    # Resolve a duplicated frame's conflicting labels to a single label:
    # return the first entry of the priority table present in label_list,
    # or None when no priority label matches.
    for priority_label in commons.rede_3_multiclass_priority_table:
        if priority_label in label_list:
            return priority_label
    return None


# Split compiled index in duplicated and non-duplicated entries
# keep=False marks every member of a duplicate group, not just the extras.
duplicatesMask = compiledPositivesIndex.duplicated(
    subset=commons.FRAME_HASH_COL_NAME, keep=False)
duplicatesIndex = compiledPositivesIndex.loc[duplicatesMask, :]
frameGroup = duplicatesIndex.groupby(commons.FRAME_HASH_COL_NAME)

print("\nTranslating all duplicates...")
# Keep one row per frame, then rewrite its 'rede3' label from the full
# duplicate group via the priority translation above.
compiledPositivesIndex = dutils.remove_duplicates(compiledPositivesIndex,
                                                  commons.FRAME_HASH_COL_NAME)
compiledPositivesIndex.set_index('FrameHash', drop=False, inplace=True)
for frameHash, group in tqdm(frameGroup):
    newLabel = _translate_dup_label(group['rede3'].values)
    compiledPositivesIndex.loc[frameHash, 'rede3'] = newLabel
compiledPositivesIndex.reset_index(drop=True, inplace=True)

print(compiledPositivesIndex['rede3'][:30])
compiledPositivesIndex.to_csv(compiledPositivesPath, index=False)
# NOTE(review): the first statement below is a continuation — its opening
# (presumably `indexPath = Path(dirs.iter_folder) / \`) lies before this chunk.
"full_dataset_softmax/iteration_{}/unlabeled_images_iteration_{}.csv".format(iteration-1, iteration-1)

# --- Automatic labeling: classify unlabeled images whose model output falls
# --- outside the [lower, upper] uncertainty band.
savedModelsFolder = Path(dirs.saved_models) / \
    "full_dataset_rede_{}_softmax/iteration_{}".format(rede, iteration)
outputPath = savedModelsFolder / \
    "outputs_full_dataset_{}_epochs_rede_{}_iteration_{}.pickle".format(epochs, rede, iteration)
newIndexPath = Path(dirs.iter_folder) / \
    "full_dataset/iteration_{}/automatic_labeled_images_iteration_{}.csv".format(iteration, iteration)

# Decision thresholds chosen for the stated positive-ratio operating points.
idealUpperThresh = 0.8923  # Ratio 99%
idealLowerThresh = 0.0904  # Ratio 1%

indexDf = pd.read_csv(indexPath)
# NOTE(review): pickleData is not used in this chunk (load_outputs_df reloads
# the same file below) — confirm it isn't consumed later before removing.
pickleData = utils.load_pickle(outputPath)

indexDf = dutils.remove_duplicates(indexDf, "FrameHash")

outputs, imgHashes, _ = dutils.load_outputs_df(outputPath)
# Keep only the first output column (positive-class score, presumably).
outputs = outputs[:, 0]

indexDf.set_index("FrameHash", drop=False, inplace=True)

print("\nAutomatic labeling with upper positive ratio 99%:")
posHashes, negHashes = dutils.automatic_labeling(outputs, imgHashes,
                                                 idealUpperThresh, idealLowerThresh)

newLabeledIndex = dutils.get_classified_index(indexDf, posHashes, negHashes,
                                              verbose=False)