def find_repeat_detections(inputFilename, outputFilename=None, options=None):
    """Find (and optionally filter out) repeat detections in a detector results file.

    Groups detections by directory, then looks for near-identical bounding boxes
    that occur on many images in the same directory (typically static objects
    misidentified as animals).  Depending on *options*, this function can also:

    * load a previously-written filtering file (options.filterFileToLoad)
      instead of re-computing matches, removing detections whose sample images
      were deleted (or are absent from options.filteredFileListToLoad)
    * render an HTML preview of suspicious detections (options.bRenderHtml)
    * write a "filtering folder" of sample images plus a jsonpickle detection
      index for manual review (options.bWriteFilteringFolder)

    Args:
        inputFilename (str): detector output file to load via load_api_results
        outputFilename (str, optional): passed through to update_detection_table
        options (RepeatDetectionOptions, optional): control parameters; a
            default-constructed instance is used when None

    Returns:
        RepeatDetectionResults: the loaded table, per-directory groupings,
        suspicious detections found (or loaded), and output file locations.
    """

    ##%% Input handling

    if options is None:
        options = RepeatDetectionOptions()

    toReturn = RepeatDetectionResults()

    # Check early to avoid problems with the output folder
    if options.bWriteFilteringFolder or options.bRenderHtml:
        assert options.outputBase is not None and len(options.outputBase) > 0
        os.makedirs(options.outputBase, exist_ok=True)

    # Load file
    detectionResults, otherFields = load_api_results(
        inputFilename,
        normalize_paths=True,
        filename_replacements=options.filenameReplacements)
    toReturn.detectionResults = detectionResults
    toReturn.otherFields = otherFields

    # Before doing any real work, make sure we can *probably* access images
    # This is just a cursory check on the first image, but it heads off most
    # problems related to incorrect mount points, etc. Better to do this before
    # spending 20 minutes finding repeat detections.
    if options.bWriteFilteringFolder or options.bRenderHtml:
        if not is_sas_url(options.imageBase):
            row = detectionResults.iloc[0]
            relativePath = row['file']
            for s in options.filenameReplacements.keys():
                relativePath = relativePath.replace(s, options.filenameReplacements[s])
            assert os.path.isfile(os.path.join(options.imageBase, relativePath))

    ##%% Separate files into directories

    # This will be a map from a directory name to smaller data frames
    rowsByDirectory = {}

    # This is a mapping back into the rows of the original table
    filenameToRow = {}

    # TODO: in the case where we're loading an existing set of FPs after manual filtering,
    # we should load these data frames too, rather than re-building them from the input.

    print('Separating files into directories...')

    # iRow = 0; row = detectionResults.iloc[0]
    for iRow, row in detectionResults.iterrows():

        relativePath = row['file']
        dirName = os.path.dirname(relativePath)

        if len(dirName) == 0:
            assert options.nDirLevelsFromLeaf == 0, \
                "Can't use the dirLevelsFromLeaf option with flat filenames"
        else:
            # Optionally treat an ancestor directory as the grouping unit,
            # e.g. nDirLevelsFromLeaf == 1 groups by the parent of each leaf folder
            if options.nDirLevelsFromLeaf > 0:
                iLevel = 0
                while (iLevel < options.nDirLevelsFromLeaf):
                    iLevel += 1
                    dirName = os.path.dirname(dirName)
            assert len(dirName) > 0

        if not dirName in rowsByDirectory:
            # Create a new DataFrame with just this row
            # rowsByDirectory[dirName] = pd.DataFrame(row)
            rowsByDirectory[dirName] = []

        rowsByDirectory[dirName].append(row)

        assert relativePath not in filenameToRow
        filenameToRow[relativePath] = iRow

    # Convert lists of rows to proper DataFrames
    dirs = list(rowsByDirectory.keys())
    for d in dirs:
        rowsByDirectory[d] = pd.DataFrame(rowsByDirectory[d])

    toReturn.rowsByDirectory = rowsByDirectory
    toReturn.filenameToRow = filenameToRow

    print('Finished separating {} files into {} directories'.format(
        len(detectionResults), len(rowsByDirectory)))

    ##% Look for matches (or load them from file)

    dirsToSearch = list(rowsByDirectory.keys())
    if options.debugMaxDir > 0:
        dirsToSearch = dirsToSearch[0:options.debugMaxDir]

    # length-nDirs list of lists of DetectionLocation objects
    suspiciousDetections = [None] * len(dirsToSearch)

    # Are we actually looking for matches, or just loading from a file?
    if len(options.filterFileToLoad) == 0:

        # We're actually looking for matches...
        print('Finding similar detections...')

        allCandidateDetections = [None] * len(dirsToSearch)

        if not options.bParallelizeComparisons:
            options.pbar = None
            # iDir = 0; dirName = dirsToSearch[iDir]
            for iDir, dirName in enumerate(tqdm(dirsToSearch)):
                allCandidateDetections[iDir] = find_matches_in_directory(
                    dirName, options, rowsByDirectory)
        else:
            # NOTE(review): options.pbar is shared with the worker threads;
            # find_matches_in_directory presumably updates it — confirm
            options.pbar = tqdm(total=len(dirsToSearch))
            allCandidateDetections = Parallel(
                n_jobs=options.nWorkers,
                prefer='threads')(delayed(find_matches_in_directory)(
                    dirName, options, rowsByDirectory)
                                  for dirName in tqdm(dirsToSearch))

        print('\nFinished looking for similar bounding boxes')

        ##%% Find suspicious locations based on match results

        print('Filtering out repeat detections...')

        nImagesWithSuspiciousDetections = 0
        nSuspiciousDetections = 0

        # For each directory
        #
        # iDir = 51
        for iDir in range(len(dirsToSearch)):

            # A list of DetectionLocation objects
            suspiciousDetectionsThisDir = []

            # A list of DetectionLocation objects
            candidateDetectionsThisDir = allCandidateDetections[iDir]

            for iLocation, candidateLocation in enumerate(candidateDetectionsThisDir):

                # occurrenceList is a list of file/detection pairs
                nOccurrences = len(candidateLocation.instances)

                # Only locations that repeat at least occurrenceThreshold times
                # are considered suspicious
                if nOccurrences < options.occurrenceThreshold:
                    continue

                nImagesWithSuspiciousDetections += nOccurrences
                nSuspiciousDetections += 1

                suspiciousDetectionsThisDir.append(candidateLocation)
                # Find the images corresponding to this bounding box, render boxes

            suspiciousDetections[iDir] = suspiciousDetectionsThisDir

        print('Finished searching for repeat detections'
              '\nFound {} unique detections on {} images that are suspicious'.format(
                  nSuspiciousDetections, nImagesWithSuspiciousDetections))

    else:

        print('Bypassing detection-finding, loading from {}'.format(
            options.filterFileToLoad))

        # Load the filtering file; use a context manager so the handle is
        # closed deterministically (the original left it dangling)
        detectionIndexFileName = options.filterFileToLoad
        with open(detectionIndexFileName, 'r') as fIn:
            sIn = fIn.read()
        suspiciousDetections = jsonpickle.decode(sIn)
        filteringBaseDir = os.path.dirname(options.filterFileToLoad)
        assert len(suspiciousDetections) == len(dirsToSearch)

        nDetectionsRemoved = 0
        nDetectionsLoaded = 0

        # We're skipping detection-finding, but to see which images are actually legit false
        # positives, we may be looking for physical files or loading from a text file.
        fileList = None
        if options.filteredFileListToLoad is not None:

            with open(options.filteredFileListToLoad) as f:
                fileList = f.readlines()
            fileList = [x.strip() for x in fileList]

            nSuspiciousDetections = sum([len(x) for x in suspiciousDetections])
            print('Loaded false positive list from file, '
                  'will remove {} of {} suspicious detections'.format(
                      len(fileList), nSuspiciousDetections))

        # For each directory
        # iDir = 0; detections = suspiciousDetections[0]
        #
        # suspiciousDetections is an array of DetectionLocation objects,
        # one per directory.
        for iDir, detections in enumerate(suspiciousDetections):

            bValidDetection = [True] * len(detections)
            nDetectionsLoaded += len(detections)

            # For each detection that was present before filtering
            # iDetection = 0; detection = detections[iDetection]
            for iDetection, detection in enumerate(detections):

                # Are we checking the directory to see whether detections were actually false
                # positives, or reading from a list?
                if fileList is None:

                    # Is the image still there?
                    imageFullPath = os.path.join(
                        filteringBaseDir, detection.sampleImageRelativeFileName)

                    # If not, remove this from the list of suspicious detections
                    if not os.path.isfile(imageFullPath):
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

                else:

                    if detection.sampleImageRelativeFileName not in fileList:
                        nDetectionsRemoved += 1
                        bValidDetection[iDetection] = False

            # ...for each detection

            nRemovedThisDir = len(bValidDetection) - sum(bValidDetection)
            if nRemovedThisDir > 0:
                print('Removed {} of {} detections from directory {}'.format(
                    nRemovedThisDir, len(detections), iDir))

            # Keep only the detections still marked valid
            detectionsFiltered = list(compress(detections, bValidDetection))
            suspiciousDetections[iDir] = detectionsFiltered

        # ...for each directory

        print('Removed {} of {} total detections via manual filtering'.format(
            nDetectionsRemoved, nDetectionsLoaded))

    # ...if we are/aren't finding detections (vs. loading from file)

    toReturn.suspiciousDetections = suspiciousDetections

    if options.bRenderHtml:

        # Render problematic locations with html (loop)

        print('Rendering html')

        nDirs = len(dirsToSearch)
        directoryHtmlFiles = [None] * nDirs

        if options.bParallelizeRendering:

            # options.pbar = tqdm(total=nDirs)
            options.pbar = None

            directoryHtmlFiles = Parallel(
                n_jobs=options.nWorkers,
                prefer='threads')(delayed(render_images_for_directory)(
                    iDir, directoryHtmlFiles, suspiciousDetections, options)
                                  for iDir in tqdm(range(nDirs)))

        else:

            options.pbar = None

            # For each directory
            # iDir = 51
            for iDir in range(nDirs):
                # Add this directory to the master list of html files
                directoryHtmlFiles[iDir] = render_images_for_directory(
                    iDir, directoryHtmlFiles, suspiciousDetections, options)

            # ...for each directory

        # Write master html file

        masterHtmlFile = os.path.join(options.outputBase, 'index.html')
        os.makedirs(options.outputBase, exist_ok=True)
        toReturn.masterHtmlFile = masterHtmlFile

        with open(masterHtmlFile, 'w') as fHtml:

            fHtml.write('<html><body>\n')
            fHtml.write('<h2><b>Repeat detections by directory</b></h2></br>\n')

            for iDir, dirHtmlFile in enumerate(directoryHtmlFiles):

                # Directories with no suspicious detections have no html file
                if dirHtmlFile is None:
                    continue

                relPath = os.path.relpath(dirHtmlFile, options.outputBase)
                dirName = dirsToSearch[iDir]

                # Remove unicode characters before formatting
                relPath = relPath.encode('ascii', 'ignore').decode('ascii')
                dirName = dirName.encode('ascii', 'ignore').decode('ascii')

                fHtml.write('<a href={}>{}</a><br/>\n'.format(relPath, dirName))

            fHtml.write('</body></html>\n')

    # ...if we're rendering html

    toReturn.allRowsFiltered = update_detection_table(toReturn, options, outputFilename)

    # Create filtering directory
    if options.bWriteFilteringFolder:

        print('Creating filtering folder...')

        # Timestamped folder name so repeated runs don't collide
        dateString = datetime.now().strftime('%Y.%m.%d.%H.%M.%S')
        filteringDir = os.path.join(options.outputBase, 'filtering_' + dateString)
        os.makedirs(filteringDir, exist_ok=True)

        # iDir = 0; suspiciousDetectionsThisDir = suspiciousDetections[iDir]
        for iDir, suspiciousDetectionsThisDir in enumerate(tqdm(suspiciousDetections)):

            # suspiciousDetectionsThisDir is a list of DetectionLocation objects
            # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
            for iDetection, detection in enumerate(suspiciousDetectionsThisDir):

                # Use the first instance (arbitrary) as the sample image
                instance = detection.instances[0]
                relativePath = instance.filename

                outputRelativePath = 'dir{:0>4d}_det{:0>4d}_n{:0>4d}.jpg'.format(
                    iDir, iDetection, len(detection.instances))
                outputFullPath = os.path.join(filteringDir, outputRelativePath)

                if is_sas_url(options.imageBase):
                    inputFullPath = relative_sas_url(options.imageBase, relativePath)
                else:
                    inputFullPath = os.path.join(options.imageBase, relativePath)
                    assert (os.path.isfile(inputFullPath)), \
                        'Not a file: {}'.format(inputFullPath)

                try:
                    render_bounding_box(detection, inputFullPath, outputFullPath,
                                        lineWidth=options.lineThickness,
                                        expansion=options.boxExpansion)
                except Exception as e:
                    # Best-effort rendering unless the caller asked for hard failure
                    print('Warning: error rendering bounding box from {} to {}: {}'.format(
                        inputFullPath, outputFullPath, e))
                    if options.bFailOnRenderError:
                        raise

                # Record where the sample image was written so a later
                # filter-file load can check whether it was deleted
                detection.sampleImageRelativeFileName = outputRelativePath

        # Write out the detection index
        detectionIndexFileName = os.path.join(filteringDir, DETECTION_INDEX_FILE_NAME)
        jsonpickle.set_encoder_options('json', sort_keys=True, indent=4)
        s = jsonpickle.encode(suspiciousDetections)
        with open(detectionIndexFileName, 'w') as f:
            f.write(s)
        toReturn.filterFile = detectionIndexFileName

        print('Done')

    # ...if we're writing filtering info

    return toReturn
def render_images_for_directory(iDir, directoryHtmlFiles, suspiciousDetections, options):
    """Render HTML preview pages for one directory's suspicious detections.

    For each suspicious detection in directory *iDir*, renders every instance's
    image with its bounding box into a per-detection folder, writes a
    per-detection index.html, then writes a directory-level index.html that
    links to each detection page (using the first instance as a thumbnail).

    Args:
        iDir (int): index into suspiciousDetections / directoryHtmlFiles
        directoryHtmlFiles (list): only used here for its length (progress
            message); entries are filled in by the caller
        suspiciousDetections (list): per-directory lists of DetectionLocation
            objects
        options (RepeatDetectionOptions): rendering and debug parameters

    Returns:
        str or None: path to the directory-level index.html, or None if this
        directory was skipped (no suspicious detections, or past
        options.debugMaxRenderDir).
    """

    nDirs = len(directoryHtmlFiles)

    if options.pbar is not None:
        options.pbar.update()

    # Debug option: skip directories past a maximum index
    if options.debugMaxRenderDir > 0 and iDir > options.debugMaxRenderDir:
        return None

    dirName = 'dir{:0>4d}'.format(iDir)

    # suspiciousDetectionsThisDir is a list of DetectionLocation objects
    suspiciousDetectionsThisDir = suspiciousDetections[iDir]

    # Nothing suspicious here, nothing to render
    if len(suspiciousDetectionsThisDir) == 0:
        return None

    timeStr = datetime.now().strftime('%H:%M:%S')
    print('Processing directory {} of {} ({})'.format(iDir, nDirs, timeStr))

    dirBaseDir = os.path.join(options.outputBase, dirName)
    os.makedirs(dirBaseDir, exist_ok=True)

    directoryDetectionHtmlFiles = []
    directoryDetectionImageInfo = []

    # For each problematic detection in this directory
    #
    # iDetection = 0; detection = suspiciousDetectionsThisDir[iDetection];
    nDetections = len(suspiciousDetectionsThisDir)
    bPrintedMissingImageWarning = False

    # iDetection = 0; detection = suspiciousDetectionsThisDir[0]
    for iDetection, detection in enumerate(suspiciousDetectionsThisDir):

        # Debug option: limit how many detections we render per directory
        if options.debugMaxRenderDetection > 0 and iDetection > options.debugMaxRenderDetection:
            break

        nInstances = len(detection.instances)
        print('Processing detection {} of {} ({} instances)'.format(
            iDetection, nDetections, nInstances))

        detectionName = 'detection{:0>4d}'.format(iDetection)
        detectionBaseDir = os.path.join(dirBaseDir, detectionName)
        os.makedirs(detectionBaseDir, exist_ok=True)

        # _ = pretty_print_object(detection)
        # Anything that reached this list should have met the repeat threshold
        assert (nInstances >= options.occurrenceThreshold)

        imageInfo = []

        # Render images

        # iInstance = 0; instance = detection.instances[iInstance]
        for iInstance, instance in enumerate(detection.instances):

            # Debug option: limit how many instances we render per detection
            if options.debugMaxRenderInstance >= 0 and iInstance >= options.debugMaxRenderInstance:
                break

            imageRelativeFilename = 'image{:0>4d}.jpg'.format(iInstance)
            imageOutputFilename = os.path.join(detectionBaseDir, imageRelativeFilename)
            thisImageInfo = {}
            thisImageInfo['filename'] = imageRelativeFilename
            confidence = instance.confidence
            confidenceStr = '{:.2f}'.format(confidence)
            # Title shown under the image: confidence plus source filename
            t = confidenceStr + ' (' + instance.filename + ')'
            thisImageInfo['title'] = t
            imageInfo.append(thisImageInfo)

            if not is_sas_url(options.imageBase):
                inputFileName = os.path.join(options.imageBase, instance.filename)
                if not os.path.isfile(inputFileName):
                    # Missing source image: optionally warn (once, or every
                    # time, per missingImageWarningType), then skip rendering
                    if options.bPrintMissingImageWarnings:
                        if (options.missingImageWarningType == 'all') or (not bPrintedMissingImageWarning):
                            print('Warning: could not find file {}'.format(inputFileName))
                            bPrintedMissingImageWarning = True
                    continue
            else:
                inputFileName = relative_sas_url(options.imageBase, instance.filename)

            render_bounding_box(detection, inputFileName, imageOutputFilename,
                                lineWidth=options.lineThickness,
                                expansion=options.boxExpansion)

        # ...for each instance

        # Write html for this detection
        detectionHtmlFile = os.path.join(detectionBaseDir, 'index.html')

        # NOTE(review): calling write_html_image_list() with no arguments
        # appears to return a default options dict — confirm against that module
        htmlOptions = write_html_image_list.write_html_image_list()
        htmlOptions['defaultImageStyle'] = 'max-width:650px;'
        write_html_image_list.write_html_image_list(detectionHtmlFile, imageInfo, htmlOptions)

        thisDirectoryImageInfo = {}
        directoryDetectionHtmlFiles.append(detectionHtmlFile)

        # Use the first image from this detection (arbitrary) as the canonical example
        # that we'll render for the directory-level page.
        thisDirectoryImageInfo['filename'] = os.path.join(
            detectionName, imageInfo[0]['filename'])
        detectionHtmlFileRelative = os.path.relpath(detectionHtmlFile, dirBaseDir)
        title = '<a href="{}">{}</a>'.format(detectionHtmlFileRelative, detectionName)
        thisDirectoryImageInfo['title'] = title
        directoryDetectionImageInfo.append(thisDirectoryImageInfo)

    # ...for each detection

    # Write the html file for this directory
    directoryHtmlFile = os.path.join(dirBaseDir, 'index.html')

    htmlOptions = write_html_image_list.write_html_image_list()
    htmlOptions['defaultImageStyle'] = 'max-width:650px;'
    write_html_image_list.write_html_image_list(directoryHtmlFile,
                                                directoryDetectionImageInfo,
                                                htmlOptions)

    return directoryHtmlFile