def get_example_images(megadb_utils: MegadbUtils, dataset_name: str,
                       class_name: str) -> List[Optional[str]]:
    """Gets SAS URLs for images of a particular class from a given dataset.

    Queries up to NUMBER_SEQUENCES_TO_QUERY sequences in which either the
    sequence-level or an image-level class label contains `class_name`,
    randomly samples up to NUMBER_EXAMPLES_PER_SPECIES of them, and picks one
    image per sampled sequence.

    Args:
        megadb_utils: used to fetch the datasets table and query sequences
        dataset_name: name of the dataset; used as the query partition key
            and as the key into the datasets table
        class_name: class label to look for

    Returns:
        A list of exactly NUMBER_EXAMPLES_PER_SPECIES entries: image URLs
        first, padded with None if fewer examples were found.
    """
    datasets_table = megadb_utils.get_datasets_table()

    # this query should be fairly fast, ~1 sec
    # NOTE(review): class_name is interpolated directly into the query text;
    # a name containing a double quote would break the query.
    query_both_levels = f'''
    SELECT TOP {NUMBER_SEQUENCES_TO_QUERY} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{class_name}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{class_name}")) > 0
    '''
    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    # sample() raises ValueError on an empty population, so sequences without
    # any images cannot contribute an example and are dropped up front
    candidate_seqs = [seq for seq in sequences if seq.get('images')]
    num_samples = min(len(candidate_seqs), NUMBER_EXAMPLES_PER_SPECIES)
    sample_seqs = sample(candidate_seqs, num_samples)

    image_urls: List[Optional[str]] = []
    for seq in sample_seqs:
        images = seq['images']
        if class_name not in (seq.get('class') or []):
            # The label is on individual images rather than on the sequence,
            # so only images carrying the label are valid examples. Fall back
            # to all images if none match, so an example is still returned.
            labeled = [im for im in images
                       if class_name in (im.get('class') or [])]
            images = labeled or images

        sample_image = sample(images, 1)[0]  # sample 1 img per sequence
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, sample_image['file'])
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"],
            dataset_info["container"],
            img_path,
            dataset_info["container_sas_key"])
        image_urls.append(img_url)

    # pad with None so callers always get a fixed-length list
    num_missing = NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)
    if num_missing > 0:
        image_urls.extend([None] * num_missing)
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
def get_example_images(megadb_utils, dataset_name, class_name):
    """Gets SAS URLs for example images of a class from a given dataset.

    Sequences are matched when `class_name` appears either in the
    sequence-level class label or in the class label of any of its images.
    One image is drawn from each of up to NUMBER_EXAMPLES_PER_SPECIES randomly
    sampled sequences.

    Args:
        megadb_utils: used to fetch the datasets table and query sequences
        dataset_name: name of the dataset; used as the query partition key
            and as the key into the datasets table
        class_name: class label to look for

    Returns:
        A list of length NUMBER_EXAMPLES_PER_SPECIES holding image URLs,
        padded at the end with None when fewer examples are available.
    """
    datasets_table = megadb_utils.get_datasets_table()

    # NOTE(review): class_name is interpolated directly into the query text;
    # a name containing a double quote would break the query.
    query_both_levels = '''
    SELECT TOP {} VALUE seq
    FROM seq
    WHERE ARRAY_CONTAINS(seq.class, "{}")
        OR (SELECT VALUE COUNT(im)
            FROM im IN seq.images
            WHERE ARRAY_CONTAINS(im.class, "{}")) > 0
    '''.format(NUMBER_SEQUENCES_TO_QUERY, class_name, class_name)

    sequences = megadb_utils.query_sequences_table(
        query_both_levels, partition_key=dataset_name)

    # A sequence with no images cannot yield an example, and sample() would
    # raise ValueError on its empty image list, so exclude such sequences.
    usable_seqs = [seq for seq in sequences if seq.get('images')]

    # sample NUMBER_EXAMPLES_PER_SPECIES sequences if possible
    sample_seqs = sample(
        usable_seqs, min(len(usable_seqs), NUMBER_EXAMPLES_PER_SPECIES))

    image_urls = []
    for seq in sample_seqs:
        candidates = seq['images']
        seq_level_match = class_name in (seq.get('class') or [])
        if not seq_level_match:
            # The class was matched on image-level labels, so restrict the
            # draw to images that actually carry it. Keep all images as a
            # fallback if none match.
            matching = [im for im in candidates
                        if class_name in (im.get('class') or [])]
            if matching:
                candidates = matching

        sample_image = sample(candidates, 1)[0]  # one image per sequence

        img_path = sample_image['file']
        img_path = MegadbUtils.get_full_path(
            datasets_table, dataset_name, img_path)
        img_path = urllib.parse.quote_plus(img_path)

        dataset_info = datasets_table[dataset_name]
        img_url = 'https://{}.blob.core.windows.net/{}/{}{}'.format(
            dataset_info["storage_account"],
            dataset_info["container"],
            img_path,
            dataset_info["container_sas_key"])
        image_urls.append(img_url)

    # pad with None so the result always has a fixed length
    if len(image_urls) < NUMBER_EXAMPLES_PER_SPECIES:
        image_urls.extend(
            [None] * (NUMBER_EXAMPLES_PER_SPECIES - len(image_urls)))
    assert len(image_urls) == NUMBER_EXAMPLES_PER_SPECIES
    return image_urls
def visualize_sequences(datasets_table, sequences, args):
    """Renders the images of the given sequences and writes an HTML index.

    For every image (optionally only those with a 'bbox' field), a rendering
    job is collected and an entry for the HTML page is built. Whole sequences
    are added until at least `args.num_to_visualize` images are collected.
    The jobs are then rendered one by one with `render_image_info`, and
    `index.html` is written to `args.output_dir`.

    Args:
        datasets_table: dict keyed by dataset name; each entry's 'container'
            field is read here, and the table is passed to MegadbUtils helpers
        sequences: iterable of sequence dicts with 'dataset' and 'seq_id',
            and optionally 'images' and 'class'
        args: object providing trim_to_images_bboxes_labeled,
            pathsep_replacement, num_to_visualize and output_dir; it is also
            forwarded to render_image_info
    """
    num_images = 0
    images_html = []
    rendering_info = []

    for seq in sequences:
        if 'images' not in seq:
            continue

        # dataset and seq_id are required fields
        dataset_name = seq['dataset']
        seq_id = seq['seq_id']

        # Sort the images in the sequence by frame number. frame_num is read
        # as optional below (default -1), so use the same default here rather
        # than raising KeyError on images that lack it.
        images_in_seq = sorted(
            seq['images'], key=lambda x: x.get('frame_num', -1))

        for im in images_in_seq:
            if args.trim_to_images_bboxes_labeled and 'bbox' not in im:
                continue

            num_images += 1

            blob_path = MegadbUtils.get_full_path(
                datasets_table, dataset_name, im['file'])
            frame_num = im.get('frame_num', -1)

            # if no class label on the image, show the class label on the
            # sequence
            im_class = im.get('class', None)
            if im_class is None:
                im_class = seq.get('class', [])

            # flatten the blob path into a single file name
            annotated_img_name = 'anno_' + blob_path.replace(
                '/', args.pathsep_replacement).replace(
                '\\', args.pathsep_replacement)

            rendering = {
                'blob_service': MegadbUtils.get_blob_service(
                    datasets_table, dataset_name),
                'container_name': datasets_table[dataset_name]['container'],
                'blob_path': blob_path,
                'bbox': im.get('bbox', []),
                'annotated_img_name': annotated_img_name,
            }
            rendering_info.append(rendering)

            images_html.append({
                'filename': 'rendered_images/{}'.format(annotated_img_name),
                'title': 'Seq ID: {}. Frame number: {}<br/> Image file: {}<br/> number of boxes: {}, image class labels: {}'.format(
                    seq_id, frame_num, blob_path, len(rendering['bbox']),
                    im_class),
                'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
            })

        # checked per sequence, so a sequence is never cut off part-way
        if num_images >= args.num_to_visualize:
            print('num_images visualized is {}'.format(num_images))
            break

    # rendering is done sequentially, with a progress bar
    render_image_info_partial = partial(render_image_info, args=args)
    for rendering in tqdm(rendering_info):
        render_image_info_partial(rendering)

    print('Making HTML...')
    html_path = os.path.join(args.output_dir, 'index.html')
    write_html_image_list(
        filename=html_path,
        images=images_html
    )
def visualize_incoming_annotations(args):
    """Renders incoming bounding box annotations on their images.

    Loads the MegaDB sequence entries from `args.megadb_entries` and the
    incoming annotation file from `args.incoming_annotation`. Each annotated
    image is located in blob storage via its dataset, sequence ID and frame
    number, downloaded, drawn with its boxes, and saved under
    `<args.output_dir>/rendered_images`. An `index.html` listing the rendered
    images is written to `args.output_dir`.

    Args:
        args: object providing megadb_entries, incoming_annotation,
            num_to_visualize (-1 to visualize all annotated images),
            trim_to_images_bboxes_labeled, output_image_width and output_dir
    """
    print('Connecting to MegaDB to get the datasets table...')
    megadb_utils = MegadbUtils()
    datasets_table = megadb_utils.get_datasets_table()

    print('Loading the MegaDB entries...')
    with open(args.megadb_entries) as f:
        sequences = json.load(f)
    print(f'Total number of sequences: {len(sequences)}')

    # dataset name -> seq_id -> list of image entries
    dataset_seq_images = defaultdict(dict)
    for seq in sequences:
        dataset_seq_images[seq['dataset']][seq['seq_id']] = seq['images']

    print('Loading incoming annotation entries...')
    incoming = IndexedJsonDb(args.incoming_annotation)
    print(
        f'Number of images in this annotation file: {len(incoming.image_id_to_image)}'
    )

    # Compare the requested sample size with the population actually sampled
    # (images that have annotations). It can be smaller than the number of
    # images, in which case sample() would raise ValueError.
    annotated_items = list(incoming.image_id_to_annotations.items())
    if (args.num_to_visualize != -1
            and args.num_to_visualize <= len(annotated_items)):
        incoming_id_to_anno = sample(annotated_items, args.num_to_visualize)
    else:
        incoming_id_to_anno = annotated_items

    # make sure the output folder for rendered images exists before saving
    rendered_dir = os.path.join(args.output_dir, 'rendered_images')
    os.makedirs(rendered_dir, exist_ok=True)

    # The file_name field in the incoming json looks like
    # alka_squirrels.seq2020_05_07_25C.frame119221.jpg
    # we need to use the dataset, sequence and frame info to find the actual
    # path in blob storage using the sequences
    images_html = []
    for image_id, annotations in tqdm(incoming_id_to_anno):
        # category_id 5 is No Object Visible
        if (args.trim_to_images_bboxes_labeled
                and annotations[0]['category_id'] == 5):
            continue

        anno_file_name = incoming.image_id_to_image[image_id]['file_name']
        parts = anno_file_name.split('.')
        dataset_name = parts[0]
        # split only once so that a later occurrence of the marker text
        # inside the value does not truncate it
        seq_id = parts[1].split('seq', 1)[1]
        frame_num = int(parts[2].split('frame', 1)[1])

        im_rel_path = get_image_rel_path(dataset_seq_images, dataset_name,
                                         seq_id, frame_num)
        if im_rel_path is None:
            print(f'Not found in megadb entries: dataset {dataset_name},'
                  f' seq_id {seq_id}, frame_num {frame_num}')
            continue

        im_full_path = megadb_utils.get_full_path(datasets_table,
                                                  dataset_name, im_rel_path)

        # download the image
        container_client = megadb_utils.get_storage_client(
            datasets_table, dataset_name)
        downloader = container_client.download_blob(im_full_path)
        image_file = io.BytesIO()
        downloader.download_to_stream(image_file)
        image = vis_utils.open_image(image_file)

        boxes = [anno['bbox'] for anno in annotations]
        classes = [anno['category_id'] for anno in annotations]

        vis_utils.render_iMerit_boxes(boxes, classes, image,
                                      label_map=incoming.cat_id_to_name)

        file_name = '{}_gtbbox.jpg'.format(
            os.path.splitext(anno_file_name)[0].replace('/', '~'))
        image = vis_utils.resize_image(image, args.output_image_width)
        image.save(os.path.join(rendered_dir, file_name))

        images_html.append({
            'filename': '{}/{}'.format('rendered_images', file_name),
            'title': '{}, number of boxes: {}'.format(
                anno_file_name, len([b for b in boxes if len(b) > 0])),
            'textStyle': 'font-family:verdana,arial,calibri;font-size:80%;text-align:left;margin-top:20;margin-bottom:5'
        })

    # Write to HTML
    images_html = sorted(images_html, key=lambda x: x['filename'])
    write_html_image_list(filename=os.path.join(args.output_dir, 'index.html'),
                          images=images_html,
                          options={
                              'headerHtml': '<h1>Sample annotations from {}</h1>'.format(
                                  args.incoming_annotation)
                          })

    print('Visualized {} images.'.format(len(images_html)))