def collect_experiment(key, root_uri, output_dir, get_pred_package=False):
    print('\nCollecting experiment {}...\n'.format(key))

    if root_uri.startswith('s3://'):
        predict_package_uris = list_paths(
            join(root_uri, key, 'bundle'), ext='predict_package.zip')
        eval_json_uris = list_paths(
            join(root_uri, key, 'eval'), ext='eval.json')
    else:
        predict_package_uris = glob.glob(
            join(root_uri, key, 'bundle', '*', 'predict_package.zip'))
        eval_json_uris = glob.glob(
            join(root_uri, key, 'eval', '*', 'eval.json'))

    if len(predict_package_uris) > 1 or len(eval_json_uris) > 1:
        print('Cannot collect from key with multiple experiments!!!')
        return

    if len(predict_package_uris) == 0 or len(eval_json_uris) == 0:
        print('Missing output!!!')
        return

    predict_package_uri = predict_package_uris[0]
    eval_json_uri = eval_json_uris[0]

    make_dir(join(output_dir, key))
    if get_pred_package:
        download_or_copy(predict_package_uri, join(output_dir, key))

    download_or_copy(eval_json_uri, join(output_dir, key))

    eval_json = file_to_json(join(output_dir, key, 'eval.json'))
    pprint.pprint(eval_json['overall'], indent=4)
def process_sceneset_results(self, training_results, validation_results,
                             tmp_dir):
    """After all scenes have been processed, process the result set.

    This writes a zip file for a group of scenes at {chip_uri}/{uuid}.zip
    containing:
    train/{scene_id}-{ind}.png
    train/{scene_id}-labels.json
    valid/{scene_id}-{ind}.png
    valid/{scene_id}-labels.json

    Args:
        training_results: dependent on the ml_backend's process_scene_data
        validation_results: dependent on the ml_backend's
            process_scene_data
    """
    self.log_options()

    group = str(uuid.uuid4())
    group_uri = join(self.backend_opts.chip_uri, '{}.zip'.format(group))
    group_path = get_local_path(group_uri, tmp_dir)
    make_dir(group_path, use_dirname=True)

    with zipfile.ZipFile(group_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        def _write_zip(results, split):
            for scene_dir in results:
                scene_paths = glob.glob(join(scene_dir, '*'))
                for p in scene_paths:
                    zipf.write(p, join(split, basename(p)))

        _write_zip(training_results, 'train')
        _write_zip(validation_results, 'valid')

    upload_or_copy(group_path, group_uri)
def get_local_path(uri, working_dir):
    """This method will simply pass along the URI if it is local.

    If the URI is on S3, it will download the data to the working
    directory, in a structure that matches S3, and return the local path.
    If the local path already exists and its timestamp is newer than that
    of the S3 object, the download will be skipped.
    """
    fs = FileSystem.get_file_system(uri)
    if fs is LocalFileSystem:
        return uri

    local_path = fs.local_path(uri, working_dir)

    do_copy = True
    if os.path.exists(local_path):
        last_modified = fs.last_modified(uri)
        if last_modified:
            # Skip the download if the local file is newer than the
            # remote file.
            local_last_modified = datetime.utcfromtimestamp(
                os.path.getmtime(local_path))
            if local_last_modified.replace(
                    tzinfo=timezone.utc) > last_modified:
                do_copy = False
        else:
            # This FileSystem doesn't support last modified.
            # By default, don't download a new version.
            do_copy = False

    if do_copy:
        dir_name = os.path.dirname(local_path)
        make_dir(dir_name)
        fs.copy_from(uri, local_path)

    return local_path
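# The timestamp check in get_local_path can be hard to follow at a glance.
# Below is a minimal, self-contained sketch of the same idea using only the
# standard library: copy only when the remote object is at least as new as
# the local file. The names (should_copy, remote_last_modified, local_file)
# are hypothetical and not part of the FileSystem API above.
import os
from datetime import datetime, timezone


def should_copy(local_file, remote_last_modified):
    """Return True if the remote object should be copied over local_file.

    remote_last_modified is a timezone-aware datetime, or None if the
    source does not report one (in which case the local copy is kept).
    """
    if not os.path.exists(local_file):
        return True
    if remote_last_modified is None:
        return False
    local_mtime = datetime.utcfromtimestamp(
        os.path.getmtime(local_file)).replace(tzinfo=timezone.utc)
    return local_mtime <= remote_last_modified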
def process_scene_data(self, scene, data, tmp_dir):
    """Process each scene's training data.

    Args:
        scene: Scene
        data: TrainingData

    Returns:
        dictionary of Scene's classes and corresponding local directory
            path
    """
    scratch_dir = join(tmp_dir, 'scratch-{}'.format(uuid.uuid4()))
    # Ensure directory is unique since scene id's could be shared between
    # training and test sets.
    scene_dir = join(scratch_dir, '{}-{}'.format(scene.id, uuid.uuid4()))
    class_dirs = {}

    for chip_idx, (chip, window, labels) in enumerate(data):
        class_id = labels.get_cell_class_id(window)
        # If a chip is not associated with a class, don't
        # use it in training data.
        if class_id is None:
            continue

        class_name = self.class_map.get_by_id(class_id).name
        class_dir = join(scene_dir, class_name)
        make_dir(class_dir)
        class_dirs[class_name] = class_dir
        chip_name = '{}.png'.format(chip_idx)
        chip_path = join(class_dir, chip_name)
        save_img(chip, chip_path)

    return class_dirs
def download_pretrained_model(self, pretrained_model_zip_uri):
    """Download pretrained model and extract it.

    This is used before training a model.

    Args:
        pretrained_model_zip_uri: (string) URI of .tar.gz file containing
            pretrained model. This file is of the form that comes from the
            Model Zoo at
            https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md # noqa
            It contains a directory at the top level with the same name as
            the root of the archive file name (if the archive is x.tar.gz,
            the directory is x), and a set of files of the form
            model.ckpt.*. This file can be located anywhere, and is not
            expected to be in the directory encapsulated by this class
            that is generated by the make_chips command. That is why it is
            passed in separately.

    Returns:
        (string) path to pretrained model file (which is model.ckpt in
            the archive)
    """
    pretrained_model_zip_path = self.download_if_needed(
        pretrained_model_zip_uri)
    pretrained_model_dir = join(self.temp_dir, 'pretrained_model')
    make_dir(pretrained_model_dir)
    with tarfile.open(pretrained_model_zip_path, 'r:gz') as tar:
        tar.extractall(pretrained_model_dir)
    model_name = os.path.splitext(
        os.path.splitext(os.path.basename(pretrained_model_zip_uri))[0])[0]
    # The extracted archive is assumed to have a single directory with
    # the name of the model derived from the archive file name.
    pretrained_model_path = join(pretrained_model_dir, model_name,
                                 'model.ckpt')
    return pretrained_model_path
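# The model name above is derived by stripping two extensions from the
# archive file name ("x.tar.gz" -> "x.tar" -> "x"), and the archive is
# expected to contain a top-level directory of that name holding the
# model.ckpt.* files. A minimal sketch of that derivation; the URI below
# is only an illustrative example, not a real requirement:
import os.path

uri = 'http://example.com/models/ssd_mobilenet_v1_coco_2018_01_28.tar.gz'
model_name = os.path.splitext(
    os.path.splitext(os.path.basename(uri))[0])[0]
# model_name == 'ssd_mobilenet_v1_coco_2018_01_28', so model.ckpt would be
# expected at <extract_dir>/ssd_mobilenet_v1_coco_2018_01_28/model.ckpt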
def process_scene_data(self, scene, data, tmp_dir):
    """Process each scene's training data.

    This writes {scene_id}/img/{scene_id}-{ind}.png and
    {scene_id}/labels/{scene_id}-{ind}.png

    Args:
        scene: Scene
        data: TrainingData

    Returns:
        backend-specific data-structures consumed by backend's
            process_sceneset_results
    """
    scene_dir = join(tmp_dir, str(scene.id))
    img_dir = join(scene_dir, 'img')
    labels_dir = join(scene_dir, 'labels')

    make_dir(img_dir)
    make_dir(labels_dir)

    for ind, (chip, window, labels) in enumerate(data):
        chip_path = join(img_dir, '{}-{}.png'.format(scene.id, ind))
        label_path = join(labels_dir, '{}-{}.png'.format(scene.id, ind))

        save_img(chip, chip_path)
        label_im = labels.get_label_arr(window).astype(np.uint8)
        save_img(label_im, label_path)

    return scene_dir
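# The cast to uint8 above matters because the label array holds small
# integer class ids and is written out as a single-band PNG. A minimal
# sketch of the same idea with numpy and Pillow; save_img itself may be
# implemented differently, this is only an assumption for illustration:
import numpy as np
from PIL import Image

label_arr = np.array([[0, 1], [2, 1]], dtype=np.int64)  # made-up class ids
Image.fromarray(label_arr.astype(np.uint8)).save('/tmp/labels-example.png')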
def process_scene_data(self, scene: Scene, data: TrainingData,
                       tmp_dir: str) -> str:
    """Process the given scene and data into a TFRecord file specifically
    associated with that scene.

    Args:
        scene: The scene data (labels stores, the raster sources, and so
            on).
        data: The training data.
        tmp_dir: (str) temporary directory to use

    Returns:
        The local path to the generated file.
    """
    # Currently TF Deeplab can only handle uint8
    if scene.raster_source.get_dtype() != np.uint8:
        raise Exception('Cannot use {} backend for imagery that does '
                        'not have data type uint8. '
                        'Use the StatsAnalyzer and StatsTransformer '
                        'to turn the raster data into uint8 data'.format(
                            rv.TF_DEEPLAB))

    tf_examples = make_tf_examples(data, self.class_map)

    base_uri = self.backend_config.training_data_uri
    split = '{}-{}'.format(scene.id, uuid.uuid4())
    record_path = join(base_uri, '{}.record'.format(split))
    record_path = get_local_path(record_path, tmp_dir)

    make_dir(record_path, use_dirname=True)
    write_tf_record(tf_examples, record_path)

    return record_path
def process_sceneset_results(self, training_results: List[str],
                             validation_results: List[str],
                             tmp_dir: str) -> None:
    """Merge TFRecord files from individual scenes into two at-large files
    (one for training data and one for validation data).

    Args:
        training_results: A list of paths to TFRecords containing
            training data.
        validation_results: A list of paths to TFRecords containing
            validation data.
        tmp_dir: (str) temporary directory to use

    Returns:
        None
    """
    base_uri = self.backend_config.training_data_uri
    chip_suffix = str(uuid.uuid4()).split('-')[0]

    training_record_path = get_record_uri(base_uri, TRAIN, chip_suffix)
    training_record_path_local = get_local_path(training_record_path,
                                                tmp_dir)
    validation_record_path = get_record_uri(base_uri, VALIDATION,
                                            chip_suffix)
    validation_record_path_local = get_local_path(validation_record_path,
                                                  tmp_dir)

    make_dir(training_record_path_local, use_dirname=True)
    make_dir(validation_record_path_local, use_dirname=True)  # sic
    merge_tf_records(training_record_path_local, training_results)
    merge_tf_records(validation_record_path_local, validation_results)
    upload_or_copy(training_record_path_local, training_record_path)
    upload_or_copy(validation_record_path_local, validation_record_path)

    if self.backend_config.debug:
        training_zip_path = join(base_uri, '{}'.format(TRAIN))
        training_zip_path_local = get_local_path(training_zip_path,
                                                 tmp_dir)
        validation_zip_path = join(base_uri, '{}'.format(VALIDATION))
        validation_zip_path_local = get_local_path(validation_zip_path,
                                                   tmp_dir)

        training_debug_dir = join(tmp_dir, 'training-debug')
        make_debug_images(
            training_record_path_local, training_debug_dir,
            self.class_map,
            self.task_config.chip_options.debug_chip_probability)
        shutil.make_archive(training_zip_path_local, 'zip',
                            training_debug_dir)

        validation_debug_dir = join(tmp_dir, 'validation-debug')
        make_debug_images(
            validation_record_path_local, validation_debug_dir,
            self.class_map,
            self.task_config.chip_options.debug_chip_probability)
        shutil.make_archive(validation_zip_path_local, 'zip',
                            validation_debug_dir)

        upload_or_copy('{}.zip'.format(training_zip_path_local),
                       '{}.zip'.format(training_zip_path))
        upload_or_copy('{}.zip'.format(validation_zip_path_local),
                       '{}.zip'.format(validation_zip_path))
def process_scene_data(self, scene, data, tmp_dir):
    """Make training chips for a scene.

    This writes a set of image chips to
    {scene_id}/{class_name}/{scene_id}-{ind}.png

    Args:
        scene: (rv.data.Scene)
        data: (rv.data.Dataset)
        tmp_dir: (str) path to temp directory

    Returns:
        (str) path to directory with scene chips {tmp_dir}/{scene_id}
    """
    scene_dir = join(tmp_dir, str(scene.id))

    for ind, (chip, window, labels) in enumerate(data):
        class_id = labels.get_cell_class_id(window)
        # If a chip is not associated with a class, don't
        # use it in training data.
        if class_id is None:
            continue

        class_name = self.task_config.class_map.get_by_id(class_id).name
        class_dir = join(scene_dir, class_name)
        make_dir(class_dir)
        chip_path = join(class_dir, '{}-{}.png'.format(scene.id, ind))
        save_img(chip, chip_path)

    return scene_dir
def process_scene_data(self, scene, data, tmp_dir):
    """Process each scene's training data.

    Args:
        scene: Scene
        data: TrainingData

    Returns:
        dictionary of Scene's classes and corresponding local directory
            path
    """
    dataset_files = DatasetFiles(self.config.training_data_uri, tmp_dir)
    scratch_dir = dataset_files.get_local_path(dataset_files.scratch_uri)
    scene_dir = join(scratch_dir, '{}-{}'.format(scene.id, uuid.uuid4()))
    class_dirs = {}

    for chip_idx, (chip, window, labels) in enumerate(data):
        class_id = labels.get_cell_class_id(window)
        if class_id is None:
            continue

        class_name = self.class_map.get_by_id(class_id).name
        class_dir = join(scene_dir, class_name)
        make_dir(class_dir)
        class_dirs[class_name] = class_dir
        chip_name = '{}.png'.format(chip_idx)
        chip_path = join(class_dir, chip_name)
        save_img(chip, chip_path)

    return class_dirs
def _make_debug_chips(split):
    debug_chips_dir = join(tmp_dir, '{}-debug-chips'.format(split))
    zip_path = join(tmp_dir, '{}-debug-chips.zip'.format(split))
    zip_uri = join(train_uri, '{}-debug-chips.zip'.format(split))
    make_dir(debug_chips_dir)
    dl = data.train_dl if split == 'train' else data.valid_dl
    i = 0
    for _, (x_batch, y_batch) in enumerate(dl):
        for x, y in zip(x_batch, y_batch):
            x = x.squeeze()
            y = y.squeeze()

            # fastai has an x.show(y=y) method, but we need to plot the
            # debug chips ourselves in order to use a custom color map
            # that matches the colors in the class_map. This could be a
            # good thing to contribute upstream to fastai.
            plt.figure(figsize=(3, 3))
            plt.axis('off')
            plt.imshow(x.data.permute((1, 2, 0)).numpy())
            plt.imshow(y.data.squeeze().numpy(), alpha=0.4, vmin=0,
                       vmax=len(colors), cmap=cmap)
            plt.savefig(join(debug_chips_dir, '{}.png'.format(i)))
            plt.close()

            i += 1
            if i > max_count:
                break
        if i > max_count:
            break

    zipdir(debug_chips_dir, zip_path)
    upload_or_copy(zip_path, zip_uri)
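# The custom color map mentioned above can be built from the class_map's
# colors with matplotlib's ListedColormap. A minimal sketch, assuming the
# colors list holds one color per class id (the values here are made up):
from matplotlib.colors import ListedColormap

colors = ['red', 'green', 'blue']  # illustrative, one color per class id
cmap = ListedColormap(colors)
# With vmin=0 and vmax=len(colors), class id k falls in the k-th band of
# the colormap, so each class keeps its assigned color in plt.imshow.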
def process_scene_data(self, scene, data, tmp_dir):
    """Make training chips for a scene.

    This writes a set of image chips to
    {scene_id}/img/{scene_id}-{ind}.png and corresponding label chips to
    {scene_id}/labels/{scene_id}-{ind}.png.

    Args:
        scene: (rv.data.Scene)
        data: (rv.data.Dataset)
        tmp_dir: (str) path to temp directory

    Returns:
        (str) path to directory with scene chips {tmp_dir}/{scene_id}
    """
    scene_dir = join(tmp_dir, str(scene.id))
    img_dir = join(scene_dir, 'img')
    labels_dir = join(scene_dir, 'labels')

    make_dir(img_dir)
    make_dir(labels_dir)

    for ind, (chip, window, labels) in enumerate(data):
        chip_path = join(img_dir, '{}-{}.png'.format(scene.id, ind))
        label_path = join(labels_dir, '{}-{}.png'.format(scene.id, ind))

        label_im = labels.get_label_arr(window).astype(np.uint8)
        save_img(label_im, label_path)
        save_img(chip, chip_path)

    return scene_dir
def _copy_train_chips(img_or_labels):
    all_uri = join(chip_dir, 'train-{}'.format(img_or_labels))
    sample_dir = 'train-{}-{}'.format(str(sample_size), img_or_labels)
    sample_dir_uri = join(chip_dir, sample_dir)
    make_dir(sample_dir_uri)
    for s in sample_images:
        upload_or_copy(join(all_uri, s), join(sample_dir_uri, s))
    return sample_dir
def test_check_empty(self):
    path = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)

    make_dir(dir, check_empty=False)
    with self.assertRaises(Exception):
        make_dir(dir, check_empty=True)
def save_model_bundle(self):
    model_bundle_dir = join(self.tmp_dir, 'model-bundle')
    make_dir(model_bundle_dir)
    shutil.copyfile(self.last_model_path,
                    join(model_bundle_dir, 'model.pth'))
    shutil.copyfile(self.config_path,
                    join(model_bundle_dir, 'config.json'))
    zipdir(model_bundle_dir, self.model_bundle_path)
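# save_model_bundle relies on a zipdir helper to turn the bundle directory
# into a single zip file. A minimal sketch of such a helper, assuming it
# simply archives every file under the directory with paths relative to
# that directory (the real zipdir may differ; the name zipdir_sketch is
# hypothetical):
import os
import zipfile


def zipdir_sketch(src_dir, zip_path):
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(src_dir):
            for name in files:
                full_path = os.path.join(root, name)
                arcname = os.path.relpath(full_path, src_dir)
                zipf.write(full_path, arcname)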
def test_file_exists_local_true(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    str_to_file(self.lorem, path)

    self.assertTrue(file_exists(path))
def process_scene_data(self, scene, data, tmp_dir):
    """Process each scene's training data.

    This writes {scene_id}/{scene_id}-{ind}.png and
    {scene_id}/{scene_id}-labels.json in COCO format.

    Args:
        scene: Scene
        data: TrainingData

    Returns:
        backend-specific data-structures consumed by backend's
            process_sceneset_results
    """
    scene_dir = join(tmp_dir, str(scene.id))
    labels_path = join(scene_dir, '{}-labels.json'.format(scene.id))

    make_dir(scene_dir)
    images = []
    annotations = []
    categories = [{
        'id': item.id,
        'name': item.name
    } for item in self.task_config.class_map.get_items()]

    for im_ind, (chip, window, labels) in enumerate(data):
        im_id = '{}-{}'.format(scene.id, im_ind)
        fn = '{}.png'.format(im_id)
        chip_path = join(scene_dir, fn)
        save_img(chip, chip_path)

        images.append({
            'file_name': fn,
            'id': im_id,
            'height': chip.shape[0],
            'width': chip.shape[1]
        })

        npboxes = labels.get_npboxes()
        npboxes = ObjectDetectionLabels.global_to_local(npboxes, window)
        for box_ind, (box, class_id) in enumerate(
                zip(npboxes, labels.get_class_ids())):
            bbox = [box[1], box[0], box[3] - box[1], box[2] - box[0]]
            bbox = [int(i) for i in bbox]
            annotations.append({
                'id': '{}-{}'.format(im_id, box_ind),
                'image_id': im_id,
                'bbox': bbox,
                'category_id': int(class_id)
            })

    coco_dict = {
        'images': images,
        'annotations': annotations,
        'categories': categories
    }
    json_to_file(coco_dict, labels_path)

    return scene_dir
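# The bbox line above converts a window-local box in
# [ymin, xmin, ymax, xmax] order into the COCO [x, y, width, height]
# convention. A minimal worked example of that reordering (the numbers are
# made up):
box = [10, 20, 50, 80]  # [ymin, xmin, ymax, xmax]
coco_bbox = [box[1], box[0], box[3] - box[1], box[2] - box[0]]
# coco_bbox == [20, 10, 60, 40], i.e. [x, y, width, height]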
def save_debug_predict_image(self, scene, debug_dir_uri):
    img = draw_debug_predict_image(scene, self.config.class_map)
    # Saving to a jpg leads to segfault for unknown reasons.
    debug_image_uri = join(debug_dir_uri, scene.id + '.png')
    with RVConfig.get_tmp_dir() as temp_dir:
        debug_image_path = get_local_path(debug_image_uri, temp_dir)
        make_dir(debug_image_path, use_dirname=True)
        img.save(debug_image_path)
        upload_or_copy(debug_image_path, debug_image_uri)
def _make_debug_chips(split):
    debug_chips_dir = join(train_uri, '{}-debug-chips'.format(split))
    make_dir(debug_chips_dir)
    ds = data.train_ds if split == 'train' else data.valid_ds
    for i, (x, y) in enumerate(ds):
        x.show(y=y)
        plt.savefig(join(debug_chips_dir, '{}.png'.format(i)))
        plt.close()
def test_force_empty(self):
    path = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)

    make_dir(dir, force_empty=False)
    self.assertTrue(os.path.isfile(path))

    make_dir(dir, force_empty=True)
    is_empty = len(os.listdir(dir)) == 0
    self.assertTrue(is_empty)
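# The two tests above pin down the check_empty and force_empty semantics:
# check_empty=True raises if the directory already has contents, and
# force_empty=True wipes existing contents before (re)creating it. A
# minimal sketch of a helper with that behaviour; the real make_dir also
# supports use_dirname and may differ in details, so this is only an
# assumption for illustration:
import os
import shutil


def make_dir_sketch(path, check_empty=False, force_empty=False,
                    use_dirname=False):
    if use_dirname:
        path = os.path.dirname(path)
    if force_empty and os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)
    if check_empty and os.listdir(path):
        raise Exception('{} needs to be an empty directory!'.format(path))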
def test_sync_from_dir_noop_local(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    src = os.path.join(self.temp_dir.name, 'lorem')
    make_dir(src, check_empty=False)

    fs = FileSystem.get_file_system(src, 'r')
    fs.write_bytes(path, bytes([0x00, 0x01]))

    sync_from_dir(src, src, delete=True)

    self.assertEqual(len(list_paths(src)), 1)
def test_copy_to_http(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    dst = 'http://localhost/'
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    str_to_file(self.lorem, path)

    self.assertRaises(NotWritableError, lambda: upload_or_copy(path, dst))
    os.remove(path)
def test_file_exists_s3_true(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    str_to_file(self.lorem, path)

    s3_path = 's3://{}/lorem.txt'.format(self.bucket_name)
    upload_or_copy(path, s3_path)

    self.assertTrue(file_exists(s3_path))
def test_last_modified(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum1.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    fs = FileSystem.get_file_system(path, 'r')

    str_to_file(self.lorem, path)
    stamp = fs.last_modified(path)

    self.assertTrue(isinstance(stamp, datetime.datetime))
def test_bundle_od_command(self):
    def get_task(tmp_dir):
        predict_package_uri = os.path.join(tmp_dir, 'predict_package.zip')
        t = rv.TaskConfig.builder(rv.OBJECT_DETECTION) \
                         .with_predict_package_uri(predict_package_uri) \
                         .with_classes(['class1']) \
                         .build()
        return t

    def get_backend(task, tmp_dir):
        model_uri = os.path.join(tmp_dir, 'model')
        template_uri = data_file_path(
            'tf_object_detection/embedded_ssd_mobilenet_v1_coco.config')
        with open(model_uri, 'w') as f:
            f.write('DUMMY')
        b = rv.BackendConfig.builder(rv.TF_OBJECT_DETECTION) \
                            .with_task(task) \
                            .with_template(template_uri) \
                            .with_model_uri(model_uri) \
                            .build()
        return b

    with RVConfig.get_tmp_dir() as tmp_dir:
        task = get_task(tmp_dir)
        backend = get_backend(task, tmp_dir)
        analyzer = self.get_analyzer(tmp_dir)
        scene = self.get_scene(tmp_dir)

        cmd = rv.CommandConfig.builder(rv.BUNDLE) \
                              .with_task(task) \
                              .with_root_uri(tmp_dir) \
                              .with_backend(backend) \
                              .with_analyzers([analyzer]) \
                              .with_scene(scene) \
                              .build() \
                              .create_command()
        cmd.run(tmp_dir)

        package_dir = os.path.join(tmp_dir, 'package')
        make_dir(package_dir)
        with zipfile.ZipFile(task.predict_package_uri, 'r') as package_zip:
            package_zip.extractall(path=package_dir)

        bundle_config_path = os.path.join(package_dir,
                                          'bundle_config.json')
        bundle_config = load_json_config(bundle_config_path,
                                         CommandConfigMsg())

        self.assertEqual(bundle_config.command_type, rv.BUNDLE)

        actual = set(os.listdir(package_dir))
        expected = set(['stats.json', 'model', 'bundle_config.json'])
        self.assertEqual(actual, expected)
def run(self, tmp_dir=None):
    if not tmp_dir:
        tmp_dir = self.get_tmp_dir()
    cc = self.command_config

    if not cc.task.predict_package_uri:
        msg = 'Skipping bundling of prediction package, no URI is set...'
        click.echo(click.style(msg, fg='yellow'))
        return

    msg = 'Bundling prediction package to {}...'.format(
        cc.task.predict_package_uri)
    log.info(msg)

    bundle_dir = os.path.join(tmp_dir, 'bundle')
    make_dir(bundle_dir)
    package_path = os.path.join(tmp_dir, 'predict_package.zip')
    bundle_files = []
    new_task, task_files = cc.task.save_bundle_files(bundle_dir)
    bundle_files.extend(task_files)
    new_backend, backend_files = cc.backend.save_bundle_files(bundle_dir)
    bundle_files.extend(backend_files)
    new_scene, scene_files = cc.scene.save_bundle_files(bundle_dir)
    bundle_files.extend(scene_files)
    new_analyzers = []
    for analyzer in cc.analyzers:
        new_analyzer, analyzer_files = analyzer.save_bundle_files(
            bundle_dir)
        new_analyzers.append(new_analyzer)
        bundle_files.extend(analyzer_files)

    new_bundle_config = cc.to_builder() \
                          .with_task(new_task) \
                          .with_backend(new_backend) \
                          .with_scene(new_scene) \
                          .with_analyzers(new_analyzers) \
                          .build()

    # Save bundle command config
    bundle_config_path = os.path.join(tmp_dir, 'bundle_config.json')
    bundle_json = json_format.MessageToJson(new_bundle_config.to_proto())
    with open(bundle_config_path, 'w') as f:
        f.write(bundle_json)

    with zipfile.ZipFile(package_path, 'w') as package_zip:
        for path in bundle_files:
            package_zip.write(path, arcname=os.path.basename(path))
        package_zip.write(bundle_config_path,
                          arcname=os.path.basename(bundle_config_path))

    upload_or_copy(package_path, cc.task.predict_package_uri)
def test_list_paths_s3(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)

    list_paths(s3_directory)
    self.assertEqual(len(list_paths(s3_directory)), 1)
def test_bytes_local(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    expected = bytes([0x00, 0x01, 0x02])
    fs = FileSystem.get_file_system(path, 'r')

    fs.write_bytes(path, expected)
    actual = fs.read_bytes(path)

    self.assertEqual(actual, expected)
def test_sync_to_dir_local(self):
    path = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    src = os.path.dirname(path)
    dst = os.path.join(self.temp_dir.name, 'xxx')
    make_dir(src, check_empty=False)
    make_dir(dst, check_empty=False)

    fs = FileSystem.get_file_system(path, 'r')
    fs.write_bytes(path, bytes([0x00, 0x01]))

    sync_to_dir(src, dst, delete=True)

    self.assertEqual(len(list_paths(dst)), 1)
def test_copy_to_local(self):
    path1 = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    path2 = os.path.join(self.temp_dir.name, 'yyy', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    dir2 = os.path.dirname(path2)
    make_dir(dir1, check_empty=False)
    make_dir(dir2, check_empty=False)

    str_to_file(self.lorem, path1)

    upload_or_copy(path1, path2)
    self.assertEqual(len(list_paths(dir2)), 1)