def run(self,
        cfg_json_uri,
        pipeline,
        commands,
        num_splits=1,
        pipeline_run_name: str = 'raster-vision'):
    num_commands = 0
    for command in commands:
        if command in pipeline.split_commands and num_splits > 1:
            num_commands += num_splits
        else:
            num_commands += 1

    makefile = '.PHONY: '
    makefile += ' '.join([str(ci) for ci in range(num_commands)])
    makefile += '\n\n'

    makefile += 'all: '
    makefile += ' '.join([str(ci) for ci in range(num_commands)])
    makefile += '\n\n'

    prev_command_inds = []
    curr_command_ind = 0
    for command in commands:
        curr_command_inds = []
        if command in pipeline.split_commands and num_splits > 1:
            for split_ind in range(num_splits):
                makefile += '{}: '.format(curr_command_ind)
                makefile += ' '.join([str(ci) for ci in prev_command_inds])
                makefile += '\n'
                invocation = (
                    'python -m rastervision.pipeline.cli run_command '
                    '{} {} --split-ind {} --num-splits {}').format(
                        cfg_json_uri, command, split_ind, num_splits)
                makefile += '\t{}\n\n'.format(invocation)
                curr_command_inds.append(curr_command_ind)
                curr_command_ind += 1
        else:
            makefile += '{}: '.format(curr_command_ind)
            makefile += ' '.join([str(ci) for ci in prev_command_inds])
            makefile += '\n'
            invocation = (
                'python -m rastervision.pipeline.cli run_command '
                '{} {}'.format(cfg_json_uri, command))
            makefile += '\t{}\n\n'.format(invocation)
            curr_command_inds.append(curr_command_ind)
            curr_command_ind += 1

        prev_command_inds = curr_command_inds

    makefile_path = join(dirname(cfg_json_uri), 'Makefile')
    str_to_file(makefile, makefile_path)
    process = Popen(['make', '-j', '-f', makefile_path])
    terminate_at_exit(process)
    exitcode = process.wait()
    if exitcode != 0:
        sys.exit(exitcode)
    else:
        return 0
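# For illustration (a hedged sketch, not verbatim runner output): with
# commands = ['analyze', 'chip'], where 'chip' is in pipeline.split_commands,
# and num_splits = 2, the code above would emit a Makefile along these lines,
# with <cfg_json_uri> standing in for the real URI:
#
#     .PHONY: 0 1 2
#
#     all: 0 1 2
#
#     0:
#     	python -m rastervision.pipeline.cli run_command <cfg_json_uri> analyze
#
#     1: 0
#     	python -m rastervision.pipeline.cli run_command <cfg_json_uri> chip --split-ind 0 --num-splits 2
#
#     2: 0
#     	python -m rastervision.pipeline.cli run_command <cfg_json_uri> chip --split-ind 1 --num-splits 2
#
# Each numbered target depends on every target of the previous command, so
# `make -j` runs the two chip splits in parallel once analyze finishes.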
def setup_training(self, loss_def_path=None):
    log.info(self.cfg)
    log.info(f'Using device: {self.device}')

    # ds = dataset, dl = dataloader
    self.train_ds = None
    self.train_dl = None
    self.valid_ds = None
    self.valid_dl = None
    self.test_ds = None
    self.test_dl = None

    self.config_path = join(self.output_dir, 'learner-config.json')
    str_to_file(self.cfg.json(), self.config_path)

    self.log_path = join(self.output_dir, 'log.csv')
    self.train_state_path = join(self.output_dir, 'train-state.json')
    model_bundle_fname = basename(self.cfg.get_model_bundle_uri())
    self.model_bundle_path = join(self.output_dir, model_bundle_fname)
    self.metric_names = self.build_metric_names()

    self.last_model_path = join(self.output_dir, 'last-model.pth')
    self.load_checkpoint()

    self.setup_loss(loss_def_path=loss_def_path)
    self.opt = self.build_optimizer()
    self.setup_data()
    self.start_epoch = self.get_start_epoch()
    self.steps_per_epoch = len(self.train_ds) // self.cfg.solver.batch_sz
    self.step_scheduler = self.build_step_scheduler()
    self.epoch_scheduler = self.build_epoch_scheduler()
    self.setup_tensorboard()
def test_file_exists_local_true(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    self.assertTrue(file_exists(path))
def test_check_empty(self):
    path = os.path.join(self.tmp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)
    make_dir(dir, check_empty=False)
    with self.assertRaises(Exception):
        make_dir(dir, check_empty=True)
def test_download_if_needed_local(self):
    with self.assertRaises(NotReadableError):
        file_to_str(self.local_path)

    str_to_file(self.content_str, self.local_path)
    upload_or_copy(self.local_path, self.local_path)
    local_path = download_if_needed(self.local_path, self.tmp_dir.name)
    self.assertEqual(local_path, self.local_path)
def save(self, output_uri):
    """Save this Evaluation to a file.

    Args:
        output_uri: string URI for the file to write.
    """
    json_str = json.dumps(self.to_json(), indent=4)
    str_to_file(json_str, output_uri)
def test_file_to_str_local(self):
    str_to_file(self.content_str, self.local_path)
    content_str = file_to_str(self.local_path)
    self.assertEqual(self.content_str, content_str)

    wrong_path = '/wrongpath/x.txt'
    with self.assertRaises(NotReadableError):
        file_to_str(wrong_path)
def save_pipeline_config(cfg: 'PipelineConfig', output_uri: str):
    """Save a PipelineConfig to a JSON file.

    Injects rv_config and plugin_versions before saving.
    """
    cfg.rv_config = rv_config.get_config_dict(registry.rv_config_schema)
    cfg.plugin_versions = registry.plugin_versions
    cfg_json = cfg.json()
    str_to_file(cfg_json, output_uri)
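# A minimal usage sketch (an assumption, not verbatim library usage): `cfg`
# is taken to be a fully built PipelineConfig, and the 'pipeline-config.json'
# filename next to `cfg.root_uri` is illustrative.
#
#     from os.path import join
#
#     cfg_json_uri = join(cfg.root_uri, 'pipeline-config.json')
#     save_pipeline_config(cfg, cfg_json_uri)
#
# The serialized config can then be passed by URI to a runner's run() method
# (see the Makefile-based run() above), which hands it to each run_command
# invocation.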
def test_force_empty(self):
    path = os.path.join(self.tmp_dir.name, 'hello', 'hello.txt')
    dir = os.path.dirname(path)
    str_to_file('hello', path)

    make_dir(dir, force_empty=False)
    self.assertTrue(os.path.isfile(path))

    make_dir(dir, force_empty=True)
    is_empty = len(os.listdir(dir)) == 0
    self.assertTrue(is_empty)
def test_copy_to_http(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    dst = 'http://localhost/'
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    self.assertRaises(NotWritableError, lambda: upload_or_copy(path, dst))
    os.remove(path)
def test_last_modified(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum1.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    fs = FileSystem.get_file_system(path, 'r')
    str_to_file(self.lorem, path)
    stamp = fs.last_modified(path)
    self.assertTrue(isinstance(stamp, datetime.datetime))
def test_file_exists_s3_true(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)

    s3_path = 's3://{}/lorem.txt'.format(self.bucket_name)
    upload_or_copy(path, s3_path)
    self.assertTrue(file_exists(s3_path))
def test_remote(self):
    with patch(
            'rastervision.pipeline.file_system.utils.download_if_needed',
            side_effect=download_if_needed) as patched_download:
        s3_path = 's3://{}/{}'.format(self.bucket_name, self.file_name)
        str_to_file(self.content_str, s3_path)
        path = get_cached_file(self.cache_dir, s3_path)
        self.assertTrue(os.path.isfile(path))

        # Check that calling it again doesn't invoke the download method again.
        path = get_cached_file(self.cache_dir, s3_path)
        self.assertTrue(os.path.isfile(path))

        self.assertEqual(patched_download.call_count, 1)
def test_copy_to_local(self):
    path1 = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    path2 = os.path.join(self.tmp_dir.name, 'yyy', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    dir2 = os.path.dirname(path2)
    make_dir(dir1, check_empty=False)
    make_dir(dir2, check_empty=False)
    str_to_file(self.lorem, path1)
    upload_or_copy(path1, path2)
    self.assertEqual(len(list_paths(dir2)), 1)
def test_list_paths_s3(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)

    self.assertEqual(len(list_paths(s3_directory)), 1)
def save_messages(self, split_ind=0, num_splits=1):
    message_maker = self.config.message_maker.build()

    split_groups = split_into_groups(
        list(zip(self.config.names, self.config.message_uris)), num_splits)
    split_group = split_groups[split_ind]

    for name, message_uri in split_group:
        # Unlike before, we use the message_maker to make the message.
        message = message_maker.make_message(name)
        str_to_file(message, message_uri)
        print('Saved message to {}'.format(message_uri))
def test_file_to_str_s3(self):
    wrong_path = 's3://wrongpath/x.txt'

    with self.assertRaises(NotWritableError):
        str_to_file(self.content_str, wrong_path)

    str_to_file(self.content_str, self.s3_path)
    content_str = file_to_str(self.s3_path)
    self.assertEqual(self.content_str, content_str)

    with self.assertRaises(NotReadableError):
        file_to_str(wrong_path)
def test_last_modified_s3(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum1.txt')
    s3_path = 's3://{}/lorem1.txt'.format(self.bucket_name)
    directory = os.path.dirname(path)
    make_dir(directory, check_empty=False)

    fs = FileSystem.get_file_system(s3_path, 'r')
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)
    stamp = fs.last_modified(s3_path)
    self.assertTrue(isinstance(stamp, datetime.datetime))
def test_download_if_needed_s3(self):
    with self.assertRaises(NotReadableError):
        file_to_str(self.s3_path)

    str_to_file(self.content_str, self.local_path)
    upload_or_copy(self.local_path, self.s3_path)
    local_path = download_if_needed(self.s3_path, self.tmp_dir.name)
    content_str = file_to_str(local_path)
    self.assertEqual(self.content_str, content_str)

    wrong_path = 's3://wrongpath/x.txt'
    with self.assertRaises(NotWritableError):
        upload_or_copy(local_path, wrong_path)
def test_file_exists(self):
    fs = FileSystem.get_file_system(self.tmp_dir.name, 'r')

    path1 = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    dir1 = os.path.dirname(path1)
    make_dir(dir1, check_empty=False)
    str_to_file(self.lorem, path1)

    self.assertTrue(fs.file_exists(dir1, include_dir=True))
    self.assertTrue(fs.file_exists(path1, include_dir=False))
    self.assertFalse(fs.file_exists(dir1, include_dir=False))
    self.assertFalse(
        fs.file_exists(dir1 + 'NOTPOSSIBLE', include_dir=False))
def test_file_exists(self):
    path = os.path.join(self.tmp_dir.name, 'lorem', 'ipsum.txt')
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
    s3_path_prefix = 's3://{}/xxx/lorem'.format(self.bucket_name)
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    make_dir(path, check_empty=False, use_dirname=True)
    str_to_file(self.lorem, path)
    upload_or_copy(path, s3_path)

    self.assertTrue(file_exists(s3_directory, include_dir=True))
    self.assertTrue(file_exists(s3_path, include_dir=False))
    self.assertFalse(file_exists(s3_path_prefix, include_dir=True))
    self.assertFalse(file_exists(s3_directory, include_dir=False))
    self.assertFalse(
        file_exists(s3_directory + 'NOTPOSSIBLE', include_dir=False))
def save_messages(self, split_ind=0, num_splits=1):
    # Save a file for each name with a message. num_splits is the number of
    # parallel jobs to use and split_ind tracks the index of the parallel
    # job. In this case we are splitting on the names/message_uris.
    split_groups = split_into_groups(
        list(zip(self.config.names, self.config.message_uris)), num_splits)
    split_group = split_groups[split_ind]

    for name, message_uri in split_group:
        message = 'hello {}!'.format(name)
        # str_to_file and most functions in the file_system package can
        # read and write transparently to different file systems based on
        # the URI pattern.
        str_to_file(message, message_uri)
        print('Saved message to {}'.format(message_uri))
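# A hedged sketch of the splitting behavior; the names and local URIs below
# are illustrative, and split_into_groups is assumed to partition a list into
# num_splits roughly equal groups (as its use above implies).
#
#     from rastervision.pipeline.utils import split_into_groups
#
#     pairs = list(zip(['alice', 'bob', 'carol'],
#                      ['/tmp/a.txt', '/tmp/b.txt', '/tmp/c.txt']))
#     groups = split_into_groups(pairs, 2)
#     # Parallel job i processes groups[i]; together the jobs cover every
#     # (name, uri) pair exactly once.
#     assert sum(len(g) for g in groups) == len(pairs)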
def filter_geojson(labels_uri, output_uri, class_names):
    """Remove features that aren't in class_names and remove class_ids."""
    labels_str = file_to_str(labels_uri)
    labels = json.loads(labels_str)

    filtered_features = []
    for feature in labels['features']:
        feature = copy.deepcopy(feature)
        properties = feature.get('properties')
        if properties:
            class_name = (properties.get('class_name')
                          or properties.get('label'))
            if class_name in class_names:
                del properties['class_id']
                filtered_features.append(feature)

    new_labels = {'features': filtered_features}
    str_to_file(json.dumps(new_labels), output_uri)
def write_vector_outputs(self, labels: SemanticSegmentationLabels) -> None:
    """Write vectorized outputs for all configs in self.vector_outputs."""
    import mask_to_polygons.vectorification as vectorification
    import mask_to_polygons.processing.denoise as denoise

    log.info('Writing vector output to disk.')

    label_arr = self._labels_to_full_label_arr(labels)
    with click.progressbar(self.vector_outputs) as bar:
        for i, vo in enumerate(bar):
            if vo.uri is None:
                log.info(f'Skipping VectorOutputConfig at index {i} '
                         'due to missing uri.')
                continue

            uri = get_local_path(vo.uri, self.tmp_dir)
            denoise_radius = vo.denoise
            mode = vo.get_mode()
            class_mask = (label_arr == vo.class_id).astype(np.uint8)

            def transform(x, y):
                return self.crs_transformer.pixel_to_map((x, y))

            if denoise_radius > 0:
                class_mask = denoise.denoise(class_mask, denoise_radius)

            if mode == 'buildings':
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask,
                    transform=transform,
                    mode=mode,
                    min_aspect_ratio=vo.min_aspect_ratio,
                    min_area=vo.min_area,
                    width_factor=vo.element_width_factor,
                    thickness=vo.element_thickness)
            elif mode == 'polygons':
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask, transform=transform, mode=mode)

            str_to_file(geojson, uri)
            upload_or_copy(uri, vo.uri)
def run(self,
        cfg_json_uri,
        pipeline,
        commands,
        num_splits=1,
        pipeline_run_name: str = 'raster-vision'):
    num_commands = 0
    for command in commands:
        if command in pipeline.split_commands and num_splits > 1:
            num_commands += num_splits
        else:
            num_commands += 1

    makefile = '.PHONY: '
    makefile += ' '.join([str(ci) for ci in range(num_commands)])
    makefile += '\n\n'

    makefile += 'all: '
    makefile += ' '.join([str(ci) for ci in range(num_commands)])
    makefile += '\n\n'

    prev_command_inds = []
    curr_command_ind = 0
    for command in commands:
        # Detect external commands: methods marked with an `external`
        # attribute that take zero or one parameters.
        if hasattr(pipeline, command):
            fn = getattr(pipeline, command)
            params = signature(fn).parameters
            external = hasattr(fn, 'external') and len(params) in {0, 1}
        else:
            external = False

        curr_command_inds = []
        if not external:
            if command in pipeline.split_commands and num_splits > 1:
                for split_ind in range(num_splits):
                    makefile += '{}: '.format(curr_command_ind)
                    makefile += ' '.join(
                        [str(ci) for ci in prev_command_inds])
                    makefile += '\n'
                    invocation = (
                        'python -m rastervision.pipeline.cli run_command '
                        '{} {} --split-ind {} --num-splits {}').format(
                            cfg_json_uri, command, split_ind, num_splits)
                    makefile += '\t{}\n\n'.format(invocation)
                    curr_command_inds.append(curr_command_ind)
                    curr_command_ind += 1
            else:
                makefile += '{}: '.format(curr_command_ind)
                makefile += ' '.join([str(ci) for ci in prev_command_inds])
                makefile += '\n'
                invocation = (
                    'python -m rastervision.pipeline.cli run_command '
                    '{} {}'.format(cfg_json_uri, command))
                makefile += '\t{}\n\n'.format(invocation)
                curr_command_inds.append(curr_command_ind)
                curr_command_ind += 1
        else:
            if len(params) == 0:
                # No-parameter external command
                cmds = [fn()]
            elif len(params) == 1 and command in pipeline.split_commands:
                # One-parameter split external command
                cmds = fn(num_splits)
            elif len(params) == 1 and command not in pipeline.split_commands:
                # One-parameter unsplit external command
                cmds = fn(1)
            else:
                # No command
                cmds = []

            for cmd in cmds:
                makefile += '{}: '.format(curr_command_ind)
                makefile += ' '.join([str(ci) for ci in prev_command_inds])
                makefile += '\n'
                invocation = ' '.join(cmd)
                makefile += '\t{}\n\n'.format(invocation)
                curr_command_inds.append(curr_command_ind)
                curr_command_ind += 1

        prev_command_inds = curr_command_inds

    makefile_path = join(dirname(cfg_json_uri), 'Makefile')
    str_to_file(makefile, makefile_path)
    process = Popen(['make', '-j', '-f', makefile_path])
    terminate_at_exit(process)
    exitcode = process.wait()
    if exitcode != 0:
        sys.exit(exitcode)
    else:
        return 0
def save(self, labels):
    """Save.

    Args:
        labels - (SemanticSegmentationLabels) labels to be saved
    """
    local_path = get_local_path(self.uri, self.tmp_dir)
    make_dir(local_path, use_dirname=True)

    transform = self.crs_transformer.get_affine_transform()
    crs = self.crs_transformer.get_image_crs()

    band_count = 1
    dtype = np.uint8
    if self.class_trans:
        band_count = 3

    mask = (np.zeros((self.extent.ymax, self.extent.xmax), dtype=np.uint8)
            if self.vector_output else None)

    # https://github.com/mapbox/rasterio/blob/master/docs/quickstart.rst
    # https://rasterio.readthedocs.io/en/latest/topics/windowed-rw.html
    with rasterio.open(
            local_path,
            'w',
            driver='GTiff',
            height=self.extent.ymax,
            width=self.extent.xmax,
            count=band_count,
            dtype=dtype,
            transform=transform,
            crs=crs) as dataset:
        for window in labels.get_windows():
            label_arr = labels.get_label_arr(window)
            window = window.intersection(self.extent)
            label_arr = label_arr[0:window.get_height(),
                                  0:window.get_width()]
            if mask is not None:
                mask[window.ymin:window.ymax,
                     window.xmin:window.xmax] = label_arr
            window = window.rasterio_format()
            if self.class_trans:
                rgb_labels = self.class_trans.class_to_rgb(label_arr)
                for chan in range(3):
                    dataset.write_band(
                        chan + 1, rgb_labels[:, :, chan], window=window)
            else:
                img = label_arr.astype(dtype)
                dataset.write_band(1, img, window=window)

    upload_or_copy(local_path, self.uri)

    if self.vector_output:
        import mask_to_polygons.vectorification as vectorification
        import mask_to_polygons.processing.denoise as denoise

        for vo in self.vector_output:
            denoise_radius = vo.denoise
            uri = vo.uri
            mode = vo.get_mode()
            class_id = vo.class_id
            class_mask = np.array(mask == class_id, dtype=np.uint8)

            def transform(x, y):
                return self.crs_transformer.pixel_to_map((x, y))

            if denoise_radius > 0:
                class_mask = denoise.denoise(class_mask, denoise_radius)

            if uri and mode == 'buildings':
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask,
                    transform=transform,
                    mode=mode,
                    min_aspect_ratio=vo.min_aspect_ratio,
                    min_area=vo.min_area,
                    width_factor=vo.element_width_factor,
                    thickness=vo.element_thickness)
            elif uri and mode == 'polygons':
                geojson = vectorification.geojson_from_mask(
                    mask=class_mask, transform=transform, mode=mode)

            str_to_file(geojson, uri)
def save(self, stats_uri):
    # Ensure lists
    means = list(self.means)
    stds = list(self.stds)

    stats = {'means': means, 'stds': stds}
    str_to_file(json.dumps(stats), stats_uri)
def __init__(self,
             cfg: LearnerConfig,
             tmp_dir: str,
             model_path: Optional[str] = None,
             model_def_path: Optional[str] = None,
             loss_def_path: Optional[str] = None):
    """Constructor.

    Args:
        cfg: configuration
        tmp_dir: root of temp dirs
        model_path: a local path to model weights. If provided, the model
            is loaded and it is assumed that this Learner will be used for
            prediction only.
        model_def_path: a local path to a directory with a hubconf.py. If
            provided, the model definition is imported from here.
        loss_def_path: a local path to a directory with a hubconf.py. If
            provided, the loss function definition is imported from here.
    """
    self.cfg = cfg
    self.tmp_dir = tmp_dir

    # TODO make cache dirs configurable
    torch_cache_dir = '/opt/data/torch-cache'
    os.environ['TORCH_HOME'] = torch_cache_dir

    self.device = 'cuda' if torch.cuda.is_available() else 'cpu'

    self.data_cache_dir = '/opt/data/data-cache'
    make_dir(self.data_cache_dir)

    if FileSystem.get_file_system(cfg.output_uri) == LocalFileSystem:
        self.output_dir = cfg.output_uri
        make_dir(self.output_dir)
    else:
        self.output_dir = get_local_path(cfg.output_uri, tmp_dir)
        make_dir(self.output_dir, force_empty=True)
        if not cfg.overfit_mode:
            self.sync_from_cloud()

    self.modules_dir = join(self.output_dir, MODULES_DIRNAME)

    self.setup_model(model_def_path=model_def_path)

    if model_path is not None:
        if isfile(model_path):
            self.model.load_state_dict(
                torch.load(model_path, map_location=self.device))
        else:
            raise Exception(
                'Model could not be found at {}'.format(model_path))
        self.model.eval()
    else:
        log.info(self.cfg)

        # ds = dataset, dl = dataloader
        self.train_ds = None
        self.train_dl = None
        self.valid_ds = None
        self.valid_dl = None
        self.test_ds = None
        self.test_dl = None

        self.config_path = join(self.output_dir, 'learner-config.json')
        str_to_file(self.cfg.json(), self.config_path)

        self.log_path = join(self.output_dir, 'log.csv')
        self.train_state_path = join(self.output_dir, 'train-state.json')
        model_bundle_fname = basename(cfg.get_model_bundle_uri())
        self.model_bundle_path = join(self.output_dir, model_bundle_fname)
        self.metric_names = self.build_metric_names()

        self.last_model_path = join(self.output_dir, 'last-model.pth')
        self.load_checkpoint()

        self.setup_loss(loss_def_path=loss_def_path)
        self.opt = self.build_optimizer()
        self.setup_data()
        self.start_epoch = self.get_start_epoch()
        self.steps_per_epoch = len(
            self.train_ds) // self.cfg.solver.batch_sz
        self.step_scheduler = self.build_step_scheduler()
        self.epoch_scheduler = self.build_epoch_scheduler()
        self.setup_tensorboard()
def test_write_str_http(self):
    self.assertRaises(NotWritableError,
                      lambda: str_to_file('xxx', 'http://localhost/'))
def test_local(self):
    local_path = os.path.join(self.tmp_dir.name, self.file_name)
    str_to_file(self.content_str, local_path)
    path = get_cached_file(self.cache_dir, local_path)
    self.assertTrue(os.path.isfile(path))