def main(): """ create RecordFileFormat files """ do_it() """im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' + '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')""" subprocess.check_output([ sys.executable, 'im2rec.py', '../data_set_files/record_format_files/filtered_data_set/val.lst', '.', '--no-shuffle', '--pass-through', '--pack-label' ]) subprocess.check_output([ sys.executable, 'im2rec.py', '../data_set_files/record_format_files/filtered_data_set/train.lst', '.', '--no-shuffle', '--pass-through', '--pack-label' ]) record_dataset = RecordFileDetection( '../data_set_files/record_format_files/filtered_data_set/val.rec', coord_normalized=True) record_dataset = RecordFileDetection( '../data_set_files/record_format_files/filtered_data_set/train.rec', coord_normalized=True) print('length:', len(record_dataset)) first_img = record_dataset[0][0] print('image shape:', first_img.shape) print('Label example:') print(record_dataset[0][1])
def get_dataset(train_dir, test_dir, args):
    """Load the train/val RecordFileDetection datasets and the validation metric."""
    train_dataset = RecordFileDetection(
        os.path.join(train_dir, 'birds_ssd_sample_train.rec'), coord_normalized=True)
    val_dataset = RecordFileDetection(
        os.path.join(test_dir, 'birds_ssd_sample_val.rec'), coord_normalized=True)
    val_metric = VOC07MApMetric(iou_thresh=0.5)
    if args.num_samples < 0:
        args.num_samples = len(train_dataset)
    if args.mixup:
        from gluoncv.data import MixupDetection
        train_dataset = MixupDetection(train_dataset)
    return train_dataset, val_dataset, val_metric
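# A minimal usage sketch for get_dataset() above. The directory paths and the
# argument values here are illustrative assumptions, not taken from the
# original training script.
import argparse

hypothetical_args = argparse.Namespace(num_samples=-1, mixup=False)
train_dataset, val_dataset, val_metric = get_dataset(
    'data/train', 'data/test', hypothetical_args)
print(len(train_dataset), 'train records;', len(val_dataset), 'val records')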
def main(): """ create RecordFileFormat files """ """im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' + '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py', 'im2rec.py')""" if TEST: # subprocess.check_output([sys.executable, 'im2rec.py', '--list', '--no-shuffle', # '../data_set_files/record_format_files/data_set_test/test', '../data_set_files/test']) subprocess.check_output([ sys.executable, 'im2rec.py', '../data_set_files/record_format_files/data_set_test/test_min.lst', '.', '--no-shuffle', '--pass-through', '--pack-label' ]) record_dataset = RecordFileDetection( '../data_set_files/record_format_files/data_set_test/test_min.rec', coord_normalized=True) else: do_it() subprocess.check_output([ sys.executable, 'im2rec.py', '../data_set_files/record_format_files/data-set_min/val.lst', '.', '--no-shuffle', '--pass-through', '--pack-label' ]) subprocess.check_output([ sys.executable, 'im2rec.py', '../data_set_files/record_format_files/data-set_min/train.lst', '.', '--no-shuffle', '--pass-through', '--pack-label' ]) record_dataset = RecordFileDetection( '../data_set_files/record_format_files/data-set_min/val.rec', coord_normalized=True) record_dataset = RecordFileDetection( '../data_set_files/record_format_files/data-set_min/train.rec', coord_normalized=True) print('length:', len(record_dataset))
from gluoncv.data import RecordFileDetection

record_dataset = RecordFileDetection('voc2007.rec', coord_normalized=True)

# we expect same results from LstDetection
print('length:', len(record_dataset))
first_img = record_dataset[3][0]
print('image shape:', first_img.shape)
print('Label example:')
print(record_dataset[3][1])

record_dataset = RecordFileDetection('voc2007.rec')
img, label = record_dataset[6]
print('imgShape: ' + str(img.shape), 'labelShape: ' + str(label.shape))
im2rec = utils.download('https://raw.githubusercontent.com/apache/incubator-mxnet/' +
                        '6843914f642c8343aaa9a09db803b6af6f5d94a2/tools/im2rec.py',
                        'im2rec.py')

# In this tutorial we skip generating the record file in a subprocess, and instead
# download a prepared val.rec:
# subprocess.check_output([sys.executable, 'im2rec.py', 'val', '.',
#                          '--no-shuffle', '--pass-through', '--pack-label'])
utils.download('https://gist.github.com/zhreshold/599999eab290e951fcfb26cdd59885e2/raw/0d945eeea2a71ba7bd3e39d463f39921acb786d1/val.rec',
               'val.rec')
utils.download('https://gist.github.com/zhreshold/599999eab290e951fcfb26cdd59885e2/raw/0d945eeea2a71ba7bd3e39d463f39921acb786d1/val.idx',
               'val.idx')

##############################################################################
# Now, similarly, we can create a dataset from the binary file we just created
# with one line of code:
from gluoncv.data import RecordFileDetection
record_dataset = RecordFileDetection('val.rec', coord_normalized=True)

# we expect the same results as from LstDetection
print('length:', len(record_dataset))
first_img = record_dataset[0][0]
print('image shape:', first_img.shape)
print('Label example:')
print(record_dataset[0][1])

##############################################################################
# .. _pascal_voc_like:
#
# 2. Derive from PASCAL VOC format
# --------------------------------
# If you have a custom dataset that fully complies with the
# `Pascal VOC <http://host.robots.ox.ac.uk/pascal/VOC/>`_ object detection format,
import argparse

import numpy as np
from gluoncv import utils as gcv_utils
from gluoncv.data import RecordFileDetection
from matplotlib import pyplot as plt

# USAGE: python greenseer/synthetic_generator/spotcheck.py ./data/syn-gen-images/chess_train.rec
parser = argparse.ArgumentParser()
parser.add_argument('record_file')
args = parser.parse_args()

CLASS_NAMES = ['rook', 'knight', 'bishop', 'king', 'queen', 'pawn']

# Load record file from ".rec" and ".idx" files
record_dataset = RecordFileDetection(args.record_file, coord_normalized=True)
num_records = len(record_dataset)
print(num_records)

# Focus on a random record
index = np.random.randint(0, num_records)
random_record = record_dataset[index]
img, label = random_record
print('Type of image', type(img))
print('Type of label', type(label))

# Display bounding boxes together with labels (class id is the last label column)
ax = gcv_utils.viz.plot_bbox(img, label[:, :-1], labels=label[:, -1],
                             class_names=CLASS_NAMES)
plt.show()
def pipe_detection_minibatch(
    epoch: int,
    batch_size: int = 50,
    channel: str = "/opt/ml/input/data/train",
    discard_partial_final: bool = False,
):
    """Legacy generator for batching RecordFileDetections from a SageMaker Pipe Mode stream

    This procedural method was explored before the cleaner approach of overriding dataset
    classes. The generator reads batches of records from a RecordIO stream, and converts each
    "stream-batch" into a GluonCV RecordFileDetection by buffering the records to an indexed
    local RecordIO file.

    Pros:
    - Doesn't require a length parameter up-front, because it really iterates through the stream
    - Can be used with shuffling transforms (which isn't necessary, as SageMaker can shuffle for you)
    - Total GluonCV compatibility, as it instantiates the genuine GluonCV RecordFileDetection class

    Cons:
    - The stream batch size must be set as a multiple of the minibatch size, which is fiddly
    - The stream is read in batches, which can still block processing on I/O
    - Introduces an outer loop in the training script, which is inconsistent with standard patterns

    Example SageMaker input channel configuration:

    ```
    train_channel = sagemaker.session.s3_input(
        f"s3://{BUCKET_NAME}/{DATA_PREFIX}/train.manifest",  # SM Ground Truth output manifest
        content_type="application/x-recordio",
        s3_data_type="AugmentedManifestFile",
        record_wrapping="RecordIO",
        attribute_names=["source-ref", "annotations"],  # To guarantee only 2 attributes fed in
        shuffle_config=sagemaker.session.ShuffleConfig(seed=1337)
    )
    ```

    ...SageMaker will produce a RecordIO stream with alternating records of image and
    annotation. This generator reads batches of records from the stream and converts each
    into a GluonCV RecordFileDetection.
    """
    ixbatch = -1
    epoch_end = False
    epoch_file = f"{channel}_{epoch}"
    epoch_records = recordio.MXRecordIO(epoch_file, "r")
    with TemporaryDirectory() as tmpdirname:
        batch_records_file = os.path.join(tmpdirname, "data.rec")
        batch_idx_file = os.path.join(tmpdirname, "data.idx")
        while not epoch_end:
            ixbatch += 1
            logger.info(f"Epoch {epoch}, stream-batch {ixbatch}, channel {channel}")
            # TODO: Wish we could use with statements for file contexts, but I think MXNet can't?
            try:
                os.remove(batch_records_file)
                os.remove(batch_idx_file)
            except OSError:
                pass
            try:
                os.mknod(batch_idx_file)
            except OSError:
                pass

            # Stream a batch of data in to the temporary batch_records file (pair):
            batch_records = recordio.MXIndexedRecordIO(batch_idx_file, batch_records_file, "w")
            image_raw = None
            image_meta = None
            ixdatum = 0
            invalid = False
            while ixdatum < batch_size:
                # Read from the SageMaker stream:
                raw = epoch_records.read()
                # Determine whether this object is the image or the annotation:
                if not raw:
                    if image_meta or image_raw:
                        raise ValueError(
                            f"Bad stream {epoch_file}: Finished with partial record {ixdatum}...\n"
                            f"{'Had' if image_raw else 'Did not have'} image; "
                            f"{'Had' if image_meta else 'Did not have'} annotations."
                        )
                    epoch_end = True
                    break
                elif raw[0] == b"{"[0]:  # (Indexing a bytes object yields an int in Python 3)
                    logger.debug(f"Record {ixdatum} got metadata: {raw[:20]}...")
                    if image_meta:
                        raise ValueError(
                            f"Bad stream {epoch_file}: Already got annotations for record {ixdatum}...\n"
                            f"Existing: {image_meta}\n"
                            f"New: {raw}"
                        )
                    else:
                        image_meta = json.loads(raw)
                else:
                    logger.debug(f"Record {ixdatum} got image: {raw[:20]}...")
                    if image_raw:
                        raise ValueError(
                            f"Bad stream {epoch_file}: Missing annotations for record {ixdatum}...\n"
                        )
                    else:
                        image_raw = raw
                        # Since a stream-batch becomes an iterable GluonCV dataset, to which
                        # downstream transformations are applied in bulk, it's best to weed out any
                        # corrupted files here if possible, rather than risk a whole mini-batch or
                        # stream-batch getting discarded:
                        try:
                            img = image.imdecode(bytearray(raw))
                            logger.debug(f"Loaded image shape {img.shape}")
                        except ValueError:
                            logger.exception("Failed to load image data - skipping...")
                            invalid = True
                        # TODO: Since we already parse images, try to buffer the tensors, not JPEG

                # If both image and annotation are collected, we're ready to pack for GluonCV:
                if image_raw is not None and len(image_raw) and image_meta:
                    if invalid:
                        image_raw = None
                        image_meta = None
                        invalid = False
                        continue
                    if image_meta.get("image_size"):
                        image_width = image_meta["image_size"][0]["width"]
                        image_height = image_meta["image_size"][0]["height"]
                        # Boxes as [class_id, xmin, ymin, xmax, ymax], normalized to 0-1:
                        boxes = [[
                            ann["class_id"],
                            ann["left"] / image_width,
                            ann["top"] / image_height,
                            (ann["left"] + ann["width"]) / image_width,
                            (ann["top"] + ann["height"]) / image_height
                        ] for ann in image_meta["annotations"]]
                    else:
                        logger.debug("Writing non-normalized bounding box (no image_size in manifest)")
                        boxes = [[
                            ann["class_id"],
                            ann["left"],
                            ann["top"],
                            ann["left"] + ann["width"],
                            ann["top"] + ann["height"]
                        ] for ann in image_meta["annotations"]]

                    # Flatten the nested boxes array into the record label:
                    boxes_flat = [val for box in boxes for val in box]
                    header_data = [2, 5] + boxes_flat  # 2 extra header values; 5 values per box
                    logger.debug(f"Annotation header data {header_data}")
                    header = recordio.IRHeader(
                        0,  # Convenience value, not used
                        header_data,
                        ixdatum,
                        0
                    )
                    batch_records.write_idx(ixdatum, recordio.pack(header, image_raw))
                    image_raw = None
                    image_meta = None
                    ixdatum += 1

            # Close the write stream (we'll re-open the file-pair to read):
            batch_records.close()

            if ixdatum == 0:
                logger.debug("Reached end of stream with no valid records - discarding")
                break
            if epoch_end and discard_partial_final:
                logger.debug("Discarding final partial batch")
                break  # (Don't yield the part-completed batch)

            dataset = RecordFileDetection(batch_records_file)
            logger.debug(f"Stream batch ready with {len(dataset)} records")
            if not len(dataset):
                raise ValueError(
                    "Stream batch unexpectedly empty after loading as RecordFileDetection"
                )
            yield dataset