def test_file_exists(self):
    """Check file_exists on S3 file and directory URIs for both include_dir modes."""
    local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
    s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)

    # Write the fixture locally, then push it to the bucket.
    make_dir(os.path.dirname(local_file), check_empty=False)
    str_to_file(self.lorem, local_file)
    upload_or_copy(local_file, s3_path)

    # The directory URI is only expected to exist when include_dir is set.
    self.assertTrue(file_exists(s3_directory, include_dir=True))
    self.assertTrue(file_exists(s3_path, include_dir=False))
    self.assertFalse(file_exists(s3_directory, include_dir=False))

    bogus_uri = s3_directory + 'NOTPOSSIBLE'
    self.assertFalse(file_exists(bogus_uri, include_dir=False))
def test_file_exists_local_true(self):
    """file_exists reports True for a local file that was just written."""
    local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    make_dir(os.path.dirname(local_file), check_empty=False)
    str_to_file(self.lorem, local_file)

    found = file_exists(local_file)
    self.assertTrue(found)
def make_train_scenes(item):
    """Build one scene config per preprocessed image split of a STAC item.

    The label URI is derived from the root directory of the training STAC.
    Every split of the same item shares that single label file; the labels
    do not have to be split the same way as the imagery because rv subsets
    them to each image automatically.

    Image splits produced by the 'PREPROCESS' aux command carry integer
    suffixes ascending from 0, so URIs are probed in order until the first
    one that does not exist.

    Args:
        item: STAC item whose id and parent id locate the imagery and labels

    Returns:
        list of scene configs, one per image split that was found
    """
    area = item.get_parent().id
    label_uri = join(
        dirname(train_stac_uri), area, '{}-labels'.format(item.id),
        '{}.geojson'.format(item.id))

    scenes = []
    split_idx = 0
    while True:
        raster_uri = join(train_img_dir, area, item.id,
                          '{}_{}.tif'.format(item.id, split_idx))
        if not file_exists(raster_uri):
            # First missing suffix marks the end of this item's splits.
            break

        # Raster source, i.e. the image itself.
        raster_source = (
            rv.RasterSourceConfig.builder(rv.RASTERIO_SOURCE)
            .with_uri(raster_uri)
            .with_channel_order([0, 1, 2])
            .build())

        # Label source, i.e. the scene's geojson labels rasterized on the fly.
        # with_rasterizer_options sets the default pixel value used for
        # background pixels in prediction. 0 is reserved for nodata pixels in
        # rv, so a postprocessing step is needed at the end for the test set
        # predictions to match the competition submission guidelines.
        label_raster_source = (
            rv.RasterSourceConfig.builder(rv.RASTERIZED_SOURCE)
            .with_vector_source(label_uri)
            .with_rasterizer_options(2)
            .build())
        label_source = (
            rv.LabelSourceConfig.builder(rv.SEMANTIC_SEGMENTATION)
            .with_raster_source(label_raster_source)
            .build())

        # Assemble the scene config from the two sources.
        scene = (
            rv.SceneConfig.builder()
            .with_task(task)
            .with_id('{}_{}'.format(item.id, split_idx))
            .with_raster_source(raster_source)
            .with_label_source(label_source)
            .build())
        scenes.append(scene)

        split_idx += 1

    return scenes
def test_file_exists_s3_true(self):
    """file_exists reports True for an object that was uploaded to S3."""
    local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
    make_dir(os.path.dirname(local_file), check_empty=False)
    str_to_file(self.lorem, local_file)

    s3_path = 's3://{}/lorem.txt'.format(self.bucket_name)
    upload_or_copy(local_file, s3_path)

    found = file_exists(s3_path)
    self.assertTrue(found)
def test_copy_from_http(self):
    """download_if_needed stores an HTTP resource under the expected local path."""
    http_path = ('https://raw.githubusercontent.com/tensorflow/models/'
                 '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
    download_root = self.temp_dir.name
    expected = os.path.join(
        download_root, 'http', 'raw.githubusercontent.com',
        'tensorflow/models',
        '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')

    download_if_needed(http_path, self.temp_dir.name)

    downloaded = file_exists(expected)
    self.assertTrue(downloaded)
    # Remove the downloaded file once its presence has been verified.
    os.remove(expected)
def make_train_scenes(item, train_stac_uri, train_img_dir, which="all"):
    """Create training scenes for the image splits of a STAC item.

    Args:
        item: STAC item; its id and its parent's id locate imagery and labels
        train_stac_uri: URI of the training STAC; labels live under its
            directory
        train_img_dir: root directory holding the split images
        which: "all" to probe suffixes 0, 1, 2, ... until a split image is
            missing, otherwise an iterable of suffixes to use without
            checking that the images exist

    Returns:
        list of whatever make_train_window returns, one entry per split
    """
    area = item.get_parent().id
    label_uri = join(
        dirname(train_stac_uri), area, f"{item.id}-labels", f"{item.id}.geojson"
    )

    def split_uri(idx):
        # URI of the idx-th split image of this item.
        return join(train_img_dir, area, item.id, f"{item.id}_{idx}.tif")

    if which != "all":
        # Explicit suffixes: trust the caller, no existence check.
        return [
            make_train_window(split_uri(idx), label_uri, item.id, idx)
            for idx in which
        ]

    scenes = []
    idx = 0
    raster_uri = split_uri(idx)
    while file_exists(raster_uri):
        scenes.append(make_train_window(raster_uri, label_uri, item.id, idx))
        idx += 1
        raster_uri = split_uri(idx)
    return scenes
def __init__(self,
             uri,
             extent,
             crs_transformer,
             tmp_dir,
             vector_output=None,
             class_map=None):
    """Constructor.

    If a file already exists at uri, a raster source reading it is created
    right away; otherwise the source is left as None.

    Args:
        uri: (str) URI of GeoTIFF file used for storing predictions as RGB
            values
        extent: (Box) The extent of the scene
        crs_transformer: (CRSTransformer)
        tmp_dir: (str) temp directory to use
        vector_output: (None or array of dicts) containing vectorization
            configuration information
        class_map: (ClassMap) with color values used to convert class ids to
            RGB values
    """
    self.uri = uri
    self.extent = extent
    self.crs_transformer = crs_transformer
    self.tmp_dir = tmp_dir
    self.vector_output = vector_output

    # Note: can't name this class_transformer due to Python using that
    # attribute
    self.class_trans = (SegmentationClassTransformer(class_map)
                        if class_map else None)

    self.source = None
    if file_exists(uri):
        source_config = (
            rv.RasterSourceConfig.builder(rv.RASTERIO_SOURCE)
            .with_uri(self.uri)
            .build())
        self.source = source_config.create_source(self.tmp_dir)
def save_image_crop(image_uri,
                    image_crop_uri,
                    label_uri=None,
                    label_crop_uri=None,
                    size=600,
                    min_features=10,
                    vector_labels=True):
    """Save a crop of an image to use for testing.

    Nothing is done if image_crop_uri already exists. Otherwise the image
    extent is divided into size x size windows and the first acceptable
    window is cropped. If label_uri is set and vector_labels is true, a
    window is acceptable only when it covers >= min_features label
    geometries.

    Args:
        image_uri: URI of original image
        image_crop_uri: URI of cropped image to save
        label_uri: optional URI of label file
        label_crop_uri: optional URI of cropped labels to save
        size: height and width of crop
        min_features: minimum number of label geometries the crop window must
            intersect (only used with vector labels)
        vector_labels: if true, label_uri is read as GeoJSON and the cropped
            labels are written as a GeoJSON FeatureCollection; if false,
            label_uri is cropped with crop_image using the same window as
            the image

    Raises:
        ValueError if cannot find a crop satisfying min_features constraint
        (including when the extent yields no windows at all).
    """
    if file_exists(image_crop_uri):
        return

    print('Saving test crop to {}...'.format(image_crop_uri))
    # Snapshot the environment so that the AWS_REQUEST_PAYER override below
    # never leaks out of this function.
    old_environ = os.environ.copy()
    try:
        request_payer = S3FileSystem.get_request_payer()
        if request_payer == 'requester':
            os.environ['AWS_REQUEST_PAYER'] = request_payer

        use_vector_labels = bool(label_uri and vector_labels)

        # Context manager guarantees the dataset handle is released, also on
        # error paths.
        with rasterio.open(image_uri) as im_dataset:
            height, width = im_dataset.height, im_dataset.width
            extent = Box(0, 0, height, width)
            windows = extent.get_windows(size, size)

            if use_vector_labels:
                crs_transformer = RasterioCRSTransformer.from_dataset(
                    im_dataset)
                vs = GeoJSONVectorSource(label_uri, crs_transformer)
                geojson = vs.get_geojson()
                geoms = [shape(f['geometry']) for f in geojson['features']]
                tree = STRtree(geoms)

                def p2m(x, y, z=None):
                    return crs_transformer.pixel_to_map((x, y))

            # Initialised up front so an empty window list ends in the
            # documented ValueError rather than an unbound local.
            use_window = False
            for window in windows:
                use_window = True
                if use_vector_labels:
                    w_polys = tree.query(window.to_shapely())
                    use_window = len(w_polys) >= min_features

                    if use_window and label_crop_uri is not None:
                        print('Saving test crop labels to {}...'.format(
                            label_crop_uri))

                        # Geometries are mapped through pixel_to_map before
                        # being serialised.
                        label_crop_features = [
                            mapping(shapely.ops.transform(p2m, wp))
                            for wp in w_polys
                        ]
                        label_crop_json = {
                            'type':
                            'FeatureCollection',
                            'features': [{
                                'geometry': f
                            } for f in label_crop_features]
                        }
                        json_to_file(label_crop_json, label_crop_uri)

                if use_window:
                    crop_image(image_uri, window, image_crop_uri)

                    # Raster labels are cropped with the same window.
                    if not vector_labels and label_uri and label_crop_uri:
                        crop_image(label_uri, window, label_crop_uri)

                    break

            if not use_window:
                raise ValueError('Could not find a good crop.')
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
def test_file_exists_s3_false(self):
    """file_exists reports False for an S3 key that was never uploaded."""
    missing_uri = 's3://{}/hello.txt'.format(self.bucket_name)
    found = file_exists(missing_uri)
    self.assertFalse(found)
def test_file_exists_local_false(self):
    """file_exists reports False for a missing file inside an existing directory."""
    missing_file = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
    # Only the parent directory is created; the file itself never is.
    make_dir(os.path.dirname(missing_file), check_empty=False)

    found = file_exists(missing_file)
    self.assertFalse(found)
def test_file_exists_http_false(self):
    """file_exists reports False for the non-existent 'XXX' path under the pinned commit URL."""
    missing_url = ('https://raw.githubusercontent.com/tensorflow/models/'
                   '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/XXX')
    found = file_exists(missing_url)
    self.assertFalse(found)
def save_image_crop(image_uri,
                    crop_uri,
                    label_uri=None,
                    size=600,
                    min_features=10):
    """Save a crop of an image to use for testing.

    Nothing is done if crop_uri already exists. Otherwise the image extent
    is divided into size x size windows and the first acceptable window is
    written to crop_uri. If label_uri is set, the crop needs to cover
    >= min_features label geometries. A window is also skipped when it
    fails the blank-pixel test below.

    Args:
        image_uri: URI of original image
        crop_uri: URI of cropped image to save
        label_uri: optional URI of GeoJSON file
        size: height and width of crop
        min_features: minimum number of label geometries the crop window must
            intersect when label_uri is set

    Raises:
        ValueError if cannot find a crop satisfying min_features constraint,
        or if no crop could be saved at all (no windows, or every candidate
        window was rejected as blank).
    """
    if file_exists(crop_uri):
        return

    print('Saving test crop to {}...'.format(crop_uri))
    # Snapshot the environment so that the AWS_REQUEST_PAYER override below
    # never leaks out of this function.
    old_environ = os.environ.copy()
    try:
        request_payer = S3FileSystem.get_request_payer()
        if request_payer == 'requester':
            os.environ['AWS_REQUEST_PAYER'] = request_payer

        # Context manager guarantees the dataset handle is released, also on
        # error paths.
        with rasterio.open(image_uri) as im_dataset:
            height, width = im_dataset.height, im_dataset.width
            extent = Box(0, 0, height, width)
            windows = extent.get_windows(size, size)

            if label_uri is not None:
                crs_transformer = RasterioCRSTransformer.from_dataset(
                    im_dataset)
                vs = GeoJSONVectorSource(label_uri, crs_transformer)
                geojson = vs.get_geojson()
                geoms = [shape(f['geometry']) for f in geojson['features']]
                tree = STRtree(geoms)

            # Tracks whether a crop was actually written; a window that has
            # enough features can still be rejected by the blank-pixel test.
            crop_saved = False
            for window in windows:
                use_window = True
                if label_uri is not None:
                    w_polys = tree.query(window.to_shapely())
                    use_window = len(w_polys) >= min_features

                if not use_window:
                    continue

                rio_window = window.rasterio_format()
                im = im_dataset.read(window=rio_window)

                # Skip windows that are (almost) entirely zero-valued.
                # NOTE(review): rasterio's read() normally returns
                # (bands, rows, cols), in which case axis=2 sums over columns
                # rather than bands -- confirm this is the intended test.
                if np.mean(np.sum(im, axis=2).ravel() == 0) < 0.9:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        crop_path = get_local_path(crop_uri, tmp_dir)
                        make_dir(crop_path, use_dirname=True)

                        # Reuse the source metadata, adjusted to the crop's
                        # size and georeferencing.
                        meta = im_dataset.meta
                        meta['width'], meta['height'] = size, size
                        meta['transform'] = rasterio.windows.transform(
                            rio_window, im_dataset.transform)

                        with rasterio.open(crop_path, 'w', **meta) as dst:
                            dst.colorinterp = im_dataset.colorinterp
                            dst.write(im)

                        upload_or_copy(crop_path, crop_uri)
                    crop_saved = True
                    break

            if not crop_saved:
                raise ValueError('Could not find a good crop.')
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
def __init__(self,
             command_definitions,
             rerun_commands=False,
             skip_file_check=False):
    """Generates a CommandDAG from a list of CommandDefinitions

    This logic checks if there are any non-existing URIs that are
    not produced as outputs by some command in the set. If so,
    it raises a ConfigError stating the missing files.

    Args:
        command_definitions: list of command definitions; each exposes
            io_def.input_uris and io_def.output_uris
        rerun_commands: if False, commands whose outputs all already exist
            are dropped from the DAG and recorded in self.skipped_commands
        skip_file_check: if True, do not verify that source input URIs exist

    Raises:
        rv.ConfigError: if an input URI that no command produces does not
            exist (only when skip_file_check is False)
    """
    # Create a set of edges, from input_uri to command_config and
    # from command_config to output_uri. Nodes for commands are their
    # index into command_definitions (ints); URI nodes are strings.
    uri_dag = nx.DiGraph()

    log.debug('Creating command and URI DAG from command definitions...')
    for idx, command_def in enumerate(command_definitions):
        uri_dag.add_node(idx)
        for input_uri in command_def.io_def.input_uris:
            uri_dag.add_edge(input_uri, idx)

        for output_uri in command_def.io_def.output_uris:
            uri_dag.add_edge(idx, output_uri)

    # Find all source input_uris, and ensure they exist.
    if not skip_file_check:
        log.debug('Ensuring input files exist...')
        # A URI with no incoming edge is not produced by any command.
        unsolved_sources = [
            uri for uri in uri_dag.nodes
            if isinstance(uri, str) and len(uri_dag.in_edges(uri)) == 0
        ]

        missing_files = []

        with click.progressbar(
                unsolved_sources,
                label='Ensuring input files exist ') as uris:
            for uri in uris:
                if not file_exists(uri):
                    missing_files.append(uri)

        if missing_files:
            # One missing URI per line, each indented by a tab.
            raise rv.ConfigError(
                'Files do not exist and are not supplied by commands:\n'
                '\t{}\n'.format(',\n\t'.join(missing_files)))

    # If we are not rerunning, remove commands that have existing outputs.
    self.skipped_commands = []
    if not rerun_commands:
        log.debug('Checking for existing output...')
        commands_to_outputs = [(idx, edge[1]) for idx in uri_dag.nodes
                               if isinstance(idx, int)
                               for edge in uri_dag.out_edges(idx)]

        with click.progressbar(
                commands_to_outputs,
                label='Checking for existing output') as lst:
            for idx, output_uri in lst:
                if file_exists(output_uri):
                    uri_dag.remove_edge(idx, output_uri)

        # A command left without outgoing edges has nothing left to produce.
        # Sorted so that skipped_commands has a deterministic order.
        for idx in sorted({idx for idx, _ in commands_to_outputs}):
            if len(uri_dag.out_edges(idx)) == 0:
                self.skipped_commands.append(command_definitions[idx])
                uri_dag.remove_node(idx)

    # Collapse the graph to create edges from command to command.
    command_id_dag = nx.DiGraph()

    log.debug('Creating DAG of commands...')
    for idx in [node for node in uri_dag.nodes if isinstance(node, int)]:
        command_id_dag.add_node(idx)
        # Two hops upstream: command <- input URI <- producing command.
        for upstream_idx in [
                edge2[0] for edge1 in uri_dag.in_edges(idx)
                for edge2 in uri_dag.in_edges(edge1[0])
        ]:
            command_id_dag.add_edge(upstream_idx, idx)

    # Feed this digraph of commands to the child runner.
    self.command_definitions = command_definitions
    self.command_id_dag = command_id_dag
def train(self, tmp_dir: str) -> None:
    """Train a DeepLab model using the task and backend config.

    Downloads the training records and the pretrained checkpoint, optionally
    resumes from a previous run, launches the training script (plus optional
    tensorboard and eval processes), exports a frozen graph, packages the
    latest checkpoint for fine tuning, and syncs the results to the training
    output URI.

    Args:
        tmp_dir: (str) temporary directory to use

    Returns:
        None
    """
    train_py = self.backend_config.script_locations.train_py
    eval_py = self.backend_config.script_locations.eval_py
    export_py = self.backend_config.script_locations.export_py

    # Setup local input and output directories
    log.info('Setting up local input and output directories')
    train_logdir = self.backend_config.training_output_uri
    train_logdir_local = get_local_path(train_logdir, tmp_dir)
    dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                 TRAIN)
    dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
    make_dir(tmp_dir)
    make_dir(train_logdir_local)
    make_dir(dataset_dir_local)

    # Download training data
    log.info('Downloading training data')
    # NOTE(review): the enumerate index `i` is never used.
    for i, record_file in enumerate(list_paths(dataset_dir)):
        download_if_needed(record_file, tmp_dir)

    # Download and untar initial checkpoint.
    log.info('Downloading and untarring initial checkpoint')
    tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
    download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
    tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
    tfic_dir = os.path.dirname(tfic_tarball)
    # NOTE(review): extractall on a downloaded archive trusts its member
    # paths -- confirm the checkpoint source is trusted.
    with tarfile.open(tfic_tarball, 'r:gz') as tar:
        tar.extractall(tfic_dir)
    # Take the first '*.index' file one directory below the extraction dir
    # and strip the suffix to get the checkpoint prefix. Raises IndexError
    # if the archive contains no such file.
    tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
    tfic_ckpt = tfic_ckpt[0:-len('.index')]

    # Restart support: fall back to the training output URI when no restart
    # directory is configured.
    train_restart_dir = self.backend_config.train_options.train_restart_dir
    if type(train_restart_dir) is not str or len(train_restart_dir) == 0:
        train_restart_dir = train_logdir

    # Get output from potential previous run so we can resume training.
    if type(train_restart_dir) is str and len(
            train_restart_dir
    ) > 0 and not self.backend_config.train_options.replace_model:
        sync_from_dir(train_restart_dir, train_logdir_local)

        # Need to update model_checkpoint_path in the checkpoint file,
        # since it has the absolute paths from the previous run which
        # was using a different temporary directory on another machine.
        # If Deeplab could save relative paths instead (like the Object
        # Detection API does), then we wouldn't need to do this.
        checkpoint_path = join(train_logdir_local, 'checkpoint')
        if file_exists(checkpoint_path):
            latest_checkpoint = get_latest_checkpoint(train_logdir_local)
            with open(checkpoint_path, 'w') as cf:
                cf.write('model_checkpoint_path: \"{}\"'.format(
                    latest_checkpoint))
    else:
        # Start from scratch: wipe any local output of a previous run.
        if self.backend_config.train_options.replace_model:
            if os.path.exists(train_logdir_local):
                shutil.rmtree(train_logdir_local)
            make_dir(train_logdir_local)

    # Periodically synchronize with remote
    sync = start_sync(
        train_logdir_local,
        train_logdir,
        sync_interval=self.backend_config.train_options.sync_interval)
    with sync:
        # Setup TFDL config
        tfdl_config = json_format.ParseDict(
            self.backend_config.tfdl_config, TrainingParametersMsg())
        log.info('tfdl_config={}'.format(tfdl_config))
        log.info('Training steps={}'.format(
            tfdl_config.training_number_of_steps))

        # Additional training options
        # num_classes is one more than the larger of the highest class id
        # and the number of classes in the class map.
        max_class = max(
            list(map(lambda c: c.id, self.class_map.get_items())))
        num_classes = len(self.class_map.get_items())
        num_classes = max(max_class, num_classes) + 1
        (train_args, train_env) = get_training_args(
            train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
            num_classes, tfdl_config)

        # Start training
        log.info('Starting training process')
        log.info(' '.join(train_args))
        train_process = Popen(train_args, env=train_env)
        terminate_at_exit(train_process)

        if self.backend_config.train_options.do_monitoring:
            # Start tensorboard
            log.info('Starting tensorboard process')
            tensorboard_process = Popen(
                ['tensorboard',
                 '--logdir={}'.format(train_logdir_local)])
            terminate_at_exit(tensorboard_process)

        if self.backend_config.train_options.do_eval:
            # Start eval script
            log.info('Starting eval script')
            eval_logdir = train_logdir_local
            eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                            dataset_dir_local, eval_logdir,
                                            tfdl_config)
            eval_process = Popen(eval_args, env=train_env)
            terminate_at_exit(eval_process)

        # Wait for training and tensorboard
        # Only the training process is waited on here; tensorboard is
        # terminated once training finishes, and the eval process is not
        # waited on in this method.
        log.info('Waiting for training and tensorboard processes')
        train_process.wait()
        if self.backend_config.train_options.do_monitoring:
            tensorboard_process.terminate()

        # Export frozen graph
        log.info(
            'Exporting frozen graph ({}/model)'.format(train_logdir_local))
        export_args = get_export_args(export_py, train_logdir_local,
                                      num_classes, tfdl_config)
        export_process = Popen(export_args)
        terminate_at_exit(export_process)
        export_process.wait()

        # Package up the model files for usage as fine tuning checkpoints
        fine_tune_checkpoint_name = self.backend_config.fine_tune_checkpoint_name
        latest_checkpoints = get_latest_checkpoint(train_logdir_local)
        model_checkpoint_files = glob.glob(
            '{}*'.format(latest_checkpoints))
        inference_graph_path = os.path.join(train_logdir_local, 'model')

        # NOTE(review): this rebinds the `tmp_dir` parameter to a fresh
        # temporary directory; the parameter is not used again afterwards.
        with RVConfig.get_tmp_dir() as tmp_dir:
            model_dir = os.path.join(tmp_dir, fine_tune_checkpoint_name)
            make_dir(model_dir)
            # The tarball is written into the local log dir, so the final
            # sync below copies it along with the rest of the output.
            model_tar = os.path.join(
                train_logdir_local,
                '{}.tar.gz'.format(fine_tune_checkpoint_name))
            shutil.copy(inference_graph_path,
                        '{}/frozen_inference_graph.pb'.format(model_dir))
            for path in model_checkpoint_files:
                shutil.copy(path, model_dir)
            with tarfile.open(model_tar, 'w:gz') as tar:
                tar.add(model_dir, arcname=os.path.basename(model_dir))

    # Perform final sync
    sync_to_dir(train_logdir_local, train_logdir, delete=False)