Пример #1
0
    def test_file_exists(self):
        """Exercise file_exists on s3:// URIs with and without include_dir.

        A local file is written and copied to an S3 key via upload_or_copy.
        The key's parent prefix must only count as existing when
        include_dir=True, the key itself must exist as a file, and an
        unrelated key under the same prefix must not exist.
        """
        local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
        s3_path = 's3://{}/xxx/lorem.txt'.format(self.bucket_name)
        s3_directory = 's3://{}/xxx/'.format(self.bucket_name)
        make_dir(os.path.dirname(local_file), check_empty=False)

        str_to_file(self.lorem, local_file)
        upload_or_copy(local_file, s3_path)

        missing_uri = s3_directory + 'NOTPOSSIBLE'
        dir_seen_as_dir = file_exists(s3_directory, include_dir=True)
        self.assertTrue(dir_seen_as_dir)
        file_seen_as_file = file_exists(s3_path, include_dir=False)
        self.assertTrue(file_seen_as_file)
        dir_seen_as_file = file_exists(s3_directory, include_dir=False)
        self.assertFalse(dir_seen_as_file)
        missing_seen = file_exists(missing_uri, include_dir=False)
        self.assertFalse(missing_seen)
Пример #2
0
    def test_file_exists_local_true(self):
        """file_exists returns a truthy value for a local file just written."""
        local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
        make_dir(os.path.dirname(local_file), check_empty=False)

        str_to_file(self.lorem, local_file)

        found = file_exists(local_file)
        self.assertTrue(found)
Пример #3
0
        def make_train_scenes(item):
            """Build one scene config per image split that exists for `item`.

            Relies on `train_stac_uri`, `train_img_dir` and `task` from the
            enclosing scope.

            Args:
                item: object exposing `.id` and `.get_parent().id`

            Returns:
                list of scene configs, one for each consecutive split
                `<item.id>_<n>.tif` (n = 0, 1, 2, ...) that file_exists finds
            """
            # The label URI is derived from the root directory of the training
            # STAC. Every split of the same image shares this one label file;
            # the original author notes that rv subsets the labels per scene
            # automatically, so they do not need to be split like the imagery.
            area = item.get_parent().id
            labels_dirname = '{}-labels'.format(item.id)
            labels_filename = '{}.geojson'.format(item.id)
            label_uri = join(
                dirname(train_stac_uri), area, labels_dirname, labels_filename)

            # Image splits produced by the 'PREPROCESS' aux command carry
            # integer suffixes ascending from 0. Probe successive suffixes and
            # stop at the first URI that does not exist.
            scenes = []
            split_idx = 0
            while True:
                raster_uri = join(train_img_dir, area, item.id,
                                  '{}_{}.tif'.format(item.id, split_idx))
                if not file_exists(raster_uri):
                    break

                # Raster source: the image split itself.
                raster_builder = rv.RasterSourceConfig.builder(
                    rv.RASTERIO_SOURCE)
                raster_builder = raster_builder.with_uri(raster_uri)
                raster_builder = raster_builder.with_channel_order([0, 1, 2])
                raster_source = raster_builder.build()

                # Label source: the scene's geojson labels, rasterized.
                # with_rasterizer_options sets the default pixel value used
                # for background pixels in prediction. The value 0 is reserved
                # for nodata pixels in rv, so a postprocessing step is needed
                # at the end for the test set predictions to match the
                # competition submission guidelines.
                rasterized_builder = rv.RasterSourceConfig.builder(
                    rv.RASTERIZED_SOURCE)
                rasterized_builder = rasterized_builder.with_vector_source(
                    label_uri)
                rasterized_builder = rasterized_builder.with_rasterizer_options(
                    2)
                label_raster_source = rasterized_builder.build()

                label_builder = rv.LabelSourceConfig.builder(
                    rv.SEMANTIC_SEGMENTATION)
                label_builder = label_builder.with_raster_source(
                    label_raster_source)
                label_source = label_builder.build()

                # Assemble the scene from the image and its labels.
                scene_builder = rv.SceneConfig.builder()
                scene_builder = scene_builder.with_task(task)
                scene_builder = scene_builder.with_id(
                    '{}_{}'.format(item.id, split_idx))
                scene_builder = scene_builder.with_raster_source(raster_source)
                scene_builder = scene_builder.with_label_source(label_source)
                scenes.append(scene_builder.build())

                split_idx += 1

            return scenes
Пример #4
0
    def test_file_exists_s3_true(self):
        """file_exists is truthy for an s3:// URI after upload_or_copy to it."""
        local_file = os.path.join(self.temp_dir.name, 'lorem', 'ipsum.txt')
        make_dir(os.path.dirname(local_file), check_empty=False)

        str_to_file(self.lorem, local_file)

        s3_path = 's3://{}/lorem.txt'.format(self.bucket_name)
        upload_or_copy(local_file, s3_path)

        found = file_exists(s3_path)
        self.assertTrue(found)
Пример #5
0
    def test_copy_from_http(self):
        """download_if_needed on an HTTPS URL yields a file under the temp dir.

        The expected local path is the download directory followed by
        'http', the host name and the URL path. The file is removed
        afterwards.
        """
        http_path = ('https://raw.githubusercontent.com/tensorflow/models/'
                     '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md')
        download_root = self.temp_dir.name
        expected_parts = [
            download_root,
            'http',
            'raw.githubusercontent.com',
            'tensorflow/models',
            '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/README.md',
        ]
        expected = os.path.join(*expected_parts)
        download_if_needed(http_path, download_root)

        downloaded = file_exists(expected)
        self.assertTrue(downloaded)
        os.remove(expected)
Пример #6
0
def make_train_scenes(item, train_stac_uri, train_img_dir, which="all"):
    """Build training scenes for the image splits belonging to `item`.

    Args:
        item: object exposing `.id` and `.get_parent().id`
        train_stac_uri: URI whose directory contains the per-area label folders
        train_img_dir: root directory holding `<area>/<item.id>/<item.id>_<n>.tif`
        which: "all" to probe splits 0, 1, 2, ... until file_exists reports
            the first missing one, or an iterable of split indices to use
            as given (no existence check is made in that case)

    Returns:
        list of whatever make_train_window returns, one entry per split
    """
    area = item.get_parent().id
    labels_dirname = f"{item.id}-labels"
    label_uri = join(
        dirname(train_stac_uri), area, labels_dirname, f"{item.id}.geojson"
    )

    def split_uri(index):
        # URI of the index-th split of this item's image.
        return join(train_img_dir, area, item.id, f"{item.id}_{index}.tif")

    if which != "all":
        # Explicit indices: trust the caller and build a scene for each.
        return [
            make_train_window(split_uri(index), label_uri, item.id, index)
            for index in which
        ]

    scenes = []
    index = 0
    while True:
        raster_uri = split_uri(index)
        if not file_exists(raster_uri):
            break
        scenes.append(make_train_window(raster_uri, label_uri, item.id, index))
        index += 1

    return scenes
    def __init__(self,
                 uri,
                 extent,
                 crs_transformer,
                 tmp_dir,
                 vector_output=None,
                 class_map=None):
        """Constructor.

        Stores the arguments on the instance and, if `uri` already exists,
        opens it as a rasterio-backed raster source.

        Args:
            uri: (str) URI of GeoTIFF file used for storing predictions as RGB values
            extent: (Box) The extent of the scene
            crs_transformer: (CRSTransformer)
            tmp_dir: (str) temp directory to use
            vector_output: (None or array of dicts) containing vectorization
                configuration information
            class_map: (ClassMap) with color values used to convert class ids to
                RGB values

        """
        self.uri = uri
        self.vector_output = vector_output
        self.extent = extent
        self.crs_transformer = crs_transformer
        self.tmp_dir = tmp_dir
        # Note: can't name this class_transformer due to Python using that attribute
        self.class_trans = (SegmentationClassTransformer(class_map)
                            if class_map else None)

        # Only build a raster source when there is an existing file to read.
        self.source = None
        if file_exists(uri):
            source_builder = rv.RasterSourceConfig.builder(rv.RASTERIO_SOURCE)
            source_config = source_builder.with_uri(self.uri).build()
            self.source = source_config.create_source(self.tmp_dir)
Пример #8
0
def save_image_crop(image_uri,
                    image_crop_uri,
                    label_uri=None,
                    label_crop_uri=None,
                    size=600,
                    min_features=10,
                    vector_labels=True):
    """Save a crop of an image to use for testing.

    Does nothing if image_crop_uri already exists. Otherwise the first
    window of the image that qualifies is cropped and saved.

    If label_uri is set and vector_labels is True, the crop needs to cover
    >= min_features.

    Args:
        image_uri: URI of original image
        image_crop_uri: URI of cropped image to save
        label_uri: optional URI of label file
        label_crop_uri: optional URI of cropped labels to save
        size: height and width of crop
        min_features: minimum number of label geometries the spatial index
            must return for a window for it to be used (only applied when
            label_uri is set and vector_labels is True)
        vector_labels: if True, label_uri is read with GeoJSONVectorSource
            and the matching geometries are written to label_crop_uri as
            JSON; if False, label_uri is cropped with crop_image just like
            the image

    Raises:
        ValueError if cannot find a crop satisfying min_features constraint
            (including when the image yields no windows at all).
    """
    if file_exists(image_crop_uri):
        return

    print('Saving test crop to {}...'.format(image_crop_uri))
    # The environment is modified below for requester-pays buckets; keep a
    # copy so it can be restored no matter how this function exits.
    old_environ = os.environ.copy()
    try:
        request_payer = S3FileSystem.get_request_payer()
        if request_payer == 'requester':
            os.environ['AWS_REQUEST_PAYER'] = request_payer

        use_vector_labels = label_uri and vector_labels

        # Open the dataset in a context manager so it is always closed.
        with rasterio.open(image_uri) as im_dataset:
            extent = Box(0, 0, im_dataset.height, im_dataset.width)
            windows = extent.get_windows(size, size)

            if use_vector_labels:
                crs_transformer = RasterioCRSTransformer.from_dataset(
                    im_dataset)
                vs = GeoJSONVectorSource(label_uri, crs_transformer)
                geojson = vs.get_geojson()
                geoms = [shape(f['geometry']) for f in geojson['features']]
                tree = STRtree(geoms)

            def p2m(x, y, z=None):
                # Only called when use_vector_labels is truthy, so
                # crs_transformer is bound by then.
                return crs_transformer.pixel_to_map((x, y))

            # Initialised up front so an empty `windows` raises the
            # ValueError below rather than a NameError.
            use_window = False
            for window in windows:
                use_window = True
                if use_vector_labels:
                    w_polys = tree.query(window.to_shapely())
                    use_window = len(w_polys) >= min_features
                    if use_window and label_crop_uri is not None:
                        print('Saving test crop labels to {}...'.format(
                            label_crop_uri))

                        label_crop_features = [
                            mapping(shapely.ops.transform(p2m, wp))
                            for wp in w_polys
                        ]
                        label_crop_json = {
                            'type':
                            'FeatureCollection',
                            'features': [{
                                'geometry': f
                            } for f in label_crop_features]
                        }
                        json_to_file(label_crop_json, label_crop_uri)

                if use_window:
                    crop_image(image_uri, window, image_crop_uri)

                    if not vector_labels and label_uri and label_crop_uri:
                        crop_image(label_uri, window, label_crop_uri)

                    # The first acceptable window is enough.
                    break

            if not use_window:
                raise ValueError('Could not find a good crop.')
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
Пример #9
0
 def test_file_exists_s3_false(self):
     """file_exists is falsy for an s3:// key this test never created."""
     missing_uri = 's3://{}/hello.txt'.format(self.bucket_name)
     found = file_exists(missing_uri)
     self.assertFalse(found)
Пример #10
0
    def test_file_exists_local_false(self):
        """file_exists is falsy for a local path whose directory exists but
        whose file was never written."""
        missing_file = os.path.join(self.temp_dir.name, 'hello', 'hello.txt')
        make_dir(os.path.dirname(missing_file), check_empty=False)

        found = file_exists(missing_file)
        self.assertFalse(found)
Пример #11
0
 def test_file_exists_http_false(self):
     """file_exists is expected to be falsy for the HTTPS URL below."""
     missing_url = ('https://raw.githubusercontent.com/tensorflow/models/'
                    '17fa52864bfc7a7444a8b921d8a8eb1669e14ebd/XXX')
     found = file_exists(missing_url)
     self.assertFalse(found)
def save_image_crop(image_uri,
                    crop_uri,
                    label_uri=None,
                    size=600,
                    min_features=10):
    """Save a crop of an image to use for testing.

    Does nothing if crop_uri already exists. Otherwise candidate windows of
    the image are tried in order and the first acceptable one is written to
    crop_uri.

    If label_uri is set, the crop needs to cover >= min_features. A window
    is also rejected when the zero-sum check on its pixel data reports a
    fraction of 0.9 or more.

    Args:
        image_uri: URI of original image
        crop_uri: URI of cropped image to save
        label_uri: optional URI of GeoJSON file
        size: height and width of crop
        min_features: minimum number of label geometries the spatial index
            must return for a window (only applied when label_uri is set)

    Raises:
        ValueError if no window was saved as a crop, i.e. no window
            satisfies the min_features constraint and passes the zero-sum
            check (including when the image yields no windows at all).
    """
    if file_exists(crop_uri):
        return

    print('Saving test crop to {}...'.format(crop_uri))
    # The environment is modified below for requester-pays buckets; keep a
    # copy so it can be restored no matter how this function exits.
    old_environ = os.environ.copy()
    try:
        request_payer = S3FileSystem.get_request_payer()
        if request_payer == 'requester':
            os.environ['AWS_REQUEST_PAYER'] = request_payer

        # Open the dataset in a context manager so it is always closed.
        with rasterio.open(image_uri) as im_dataset:
            extent = Box(0, 0, im_dataset.height, im_dataset.width)
            windows = extent.get_windows(size, size)

            if label_uri is not None:
                crs_transformer = RasterioCRSTransformer.from_dataset(
                    im_dataset)
                vs = GeoJSONVectorSource(label_uri, crs_transformer)
                geojson = vs.get_geojson()
                geoms = [shape(f['geometry']) for f in geojson['features']]
                tree = STRtree(geoms)

            # Tracks whether a crop was actually written. A per-window flag
            # is not enough: a window can pass the feature check and still
            # be rejected by the zero-sum check.
            crop_saved = False
            for window in windows:
                if label_uri is not None:
                    w_polys = tree.query(window.to_shapely())
                    if len(w_polys) < min_features:
                        continue

                rio_window = window.rasterio_format()
                im = im_dataset.read(window=rio_window)

                # NOTE(review): rasterio's read() documents a
                # (bands, rows, cols) array, so axis=2 sums along columns
                # rather than across bands. Left unchanged to keep crop
                # selection stable; confirm the intended axis.
                zero_fraction = np.mean(np.sum(im, axis=2).ravel() == 0)
                if zero_fraction < 0.9:
                    with tempfile.TemporaryDirectory() as tmp_dir:
                        crop_path = get_local_path(crop_uri, tmp_dir)
                        make_dir(crop_path, use_dirname=True)

                        meta = im_dataset.meta
                        meta['width'], meta['height'] = size, size
                        meta['transform'] = rasterio.windows.transform(
                            rio_window, im_dataset.transform)

                        with rasterio.open(crop_path, 'w', **meta) as dst:
                            dst.colorinterp = im_dataset.colorinterp
                            dst.write(im)

                        upload_or_copy(crop_path, crop_uri)
                    crop_saved = True
                    break

            if not crop_saved:
                raise ValueError('Could not find a good crop.')
    finally:
        os.environ.clear()
        os.environ.update(old_environ)
Пример #13
0
    def __init__(self,
                 command_definitions,
                 rerun_commands=False,
                 skip_file_check=False):
        """Generates a CommandDAG from a list of CommandDefinitions

        This logic checks if there are any non-existing URIs that are
        not produced as outputs by some command in the set. If so,
        it raises a ConfigError stating the missing files.

        Args:
            command_definitions: sequence of command definitions; each must
                expose `io_def.input_uris` and `io_def.output_uris`
            rerun_commands: if False, commands whose output URIs all already
                exist are dropped from the DAG and recorded in
                `self.skipped_commands`
            skip_file_check: if True, do not verify that source input URIs
                exist

        Raises:
            rv.ConfigError: if an input URI that no command produces does
                not exist (only when skip_file_check is False)
        """
        # Create a set of edges, from input_uri to command_config and
        # from command_config to output_uri. Nodes for commands are their
        # index into command_definitions (ints); nodes for URIs are strings.

        uri_dag = nx.DiGraph()

        log.debug('Creating command and URI DAG from command definitions...')
        for idx, command_def in enumerate(command_definitions):
            uri_dag.add_node(idx)
            for input_uri in command_def.io_def.input_uris:
                uri_dag.add_edge(input_uri, idx)

            for output_uri in command_def.io_def.output_uris:
                uri_dag.add_edge(idx, output_uri)

        # Find all source input_uris, and ensure they exist.
        if not skip_file_check:
            log.debug('Ensuring input files exist...')
            # A URI node with no incoming edge is not produced by any command.
            unsolved_sources = [
                uri for uri in uri_dag.nodes
                if isinstance(uri, str) and len(uri_dag.in_edges(uri)) == 0
            ]

            with click.progressbar(
                    unsolved_sources,
                    label='Ensuring input files exist  ') as uris:
                missing_files = [uri for uri in uris if not file_exists(uri)]

            if any(missing_files):
                # One missing file per line. The separator previously used
                # '\b' (backspace) where a newline was intended.
                raise rv.ConfigError(
                    'Files do not exist and are not supplied by commands:\n'
                    '\t{}\n'.format(',\n\t'.join(missing_files)))

        # If we are not rerunning, remove commands that have existing outputs.
        self.skipped_commands = []
        if not rerun_commands:
            log.debug('Checking for existing output...')
            commands_to_outputs = [(idx, edge[1]) for idx in uri_dag.nodes
                                   if isinstance(idx, int)
                                   for edge in uri_dag.out_edges(idx)]
            with click.progressbar(
                    commands_to_outputs,
                    label='Checking for existing output') as lst:
                for idx, output_uri in lst:
                    if file_exists(output_uri):
                        uri_dag.remove_edge(idx, output_uri)

            # A command with no remaining output edges has nothing left to
            # produce, so it is skipped.
            for idx in {idx for idx, _ in commands_to_outputs}:
                if len(uri_dag.out_edges(idx)) == 0:
                    self.skipped_commands.append(command_definitions[idx])
                    uri_dag.remove_node(idx)

        # Collapse the graph to create edges from command to command.
        command_id_dag = nx.DiGraph()

        log.debug('Creating DAG of commands...')
        for idx in [node for node in uri_dag.nodes if isinstance(node, int)]:
            command_id_dag.add_node(idx)
            # Two hops upstream: command <- input URI <- producing command.
            for upstream_idx in [
                    edge2[0] for edge1 in uri_dag.in_edges(idx)
                    for edge2 in uri_dag.in_edges(edge1[0])
            ]:
                command_id_dag.add_edge(upstream_idx, idx)

        # Feed this digraph of commands to the child runner.
        self.command_definitions = command_definitions
        self.command_id_dag = command_id_dag
Пример #14
0
    def train(self, tmp_dir: str) -> None:
        """Train a DeepLab model using the task and backend config.

        Steps, in order: set up local directories, download the training
        records and the pretrained checkpoint, optionally sync in the output
        of a previous run, launch the training (and optionally tensorboard
        and eval) subprocesses, export the frozen graph, package the model
        files into a tarball, and sync the local output directory back to
        the configured training output URI.

        Args:
            tmp_dir: (str) temporary directory to use

        Returns:
             None
        """
        train_py = self.backend_config.script_locations.train_py
        eval_py = self.backend_config.script_locations.eval_py
        export_py = self.backend_config.script_locations.export_py

        # Setup local input and output directories
        log.info('Setting up local input and output directories')
        train_logdir = self.backend_config.training_output_uri
        train_logdir_local = get_local_path(train_logdir, tmp_dir)
        dataset_dir = get_record_dir(self.backend_config.training_data_uri,
                                     TRAIN)
        dataset_dir_local = get_local_path(dataset_dir, tmp_dir)
        make_dir(tmp_dir)
        make_dir(train_logdir_local)
        make_dir(dataset_dir_local)

        # Download training data
        log.info('Downloading training data')
        # NOTE(review): the enumerate index `i` is never used.
        for i, record_file in enumerate(list_paths(dataset_dir)):
            download_if_needed(record_file, tmp_dir)

        # Download and untar initial checkpoint.
        log.info('Downloading and untarring initial checkpoint')
        tf_initial_checkpoints_uri = self.backend_config.pretrained_model_uri
        download_if_needed(tf_initial_checkpoints_uri, tmp_dir)
        tfic_tarball = get_local_path(tf_initial_checkpoints_uri, tmp_dir)
        tfic_dir = os.path.dirname(tfic_tarball)
        # NOTE(review): extractall is called without member filtering; the
        # archive comes from the configured pretrained_model_uri, so this
        # trusts that source.
        with tarfile.open(tfic_tarball, 'r:gz') as tar:
            tar.extractall(tfic_dir)
        # Take the first '*.index' file one directory below the extraction
        # dir; raises IndexError if the archive contained none.
        tfic_ckpt = glob.glob('{}/*/*.index'.format(tfic_dir))[0]
        # Drop the '.index' suffix to leave the checkpoint path prefix.
        tfic_ckpt = tfic_ckpt[0:-len('.index')]

        # Restart support
        # Fall back to the training output URI when no restart dir is set.
        train_restart_dir = self.backend_config.train_options.train_restart_dir
        if type(train_restart_dir) is not str or len(train_restart_dir) == 0:
            train_restart_dir = train_logdir

        # Get output from potential previous run so we can resume training.
        # NOTE(review): after the fallback above, the str/non-empty part of
        # this condition can only fail if train_logdir itself is not a
        # non-empty str, so in practice replace_model decides the branch.
        if type(train_restart_dir) is str and len(
                train_restart_dir
        ) > 0 and not self.backend_config.train_options.replace_model:
            sync_from_dir(train_restart_dir, train_logdir_local)

            # Need to update model_checkpoint_path in the checkpoint file,
            # since it has the absolute paths from the previous run which
            # was using a different temporary directory on another machine.
            # If Deeplab could save relative paths instead (like the Object
            # Detection API does), then we wouldn't need to do this.
            checkpoint_path = join(train_logdir_local, 'checkpoint')
            if file_exists(checkpoint_path):
                latest_checkpoint = get_latest_checkpoint(train_logdir_local)
                with open(checkpoint_path, 'w') as cf:
                    cf.write('model_checkpoint_path: \"{}\"'.format(
                        latest_checkpoint))
        else:
            # Start from scratch: wipe and recreate the local output dir.
            if self.backend_config.train_options.replace_model:
                if os.path.exists(train_logdir_local):
                    shutil.rmtree(train_logdir_local)
                make_dir(train_logdir_local)

        # Periodically synchronize with remote
        sync = start_sync(
            train_logdir_local,
            train_logdir,
            sync_interval=self.backend_config.train_options.sync_interval)

        with sync:
            # Setup TFDL config
            tfdl_config = json_format.ParseDict(
                self.backend_config.tfdl_config, TrainingParametersMsg())
            log.info('tfdl_config={}'.format(tfdl_config))
            log.info('Training steps={}'.format(
                tfdl_config.training_number_of_steps))

            # Additional training options
            # num_classes ends up one more than the larger of the highest
            # class id and the number of classes.
            max_class = max(
                list(map(lambda c: c.id, self.class_map.get_items())))
            num_classes = len(self.class_map.get_items())
            num_classes = max(max_class, num_classes) + 1
            (train_args, train_env) = get_training_args(
                train_py, train_logdir_local, tfic_ckpt, dataset_dir_local,
                num_classes, tfdl_config)

            # Start training
            log.info('Starting training process')
            log.info(' '.join(train_args))
            train_process = Popen(train_args, env=train_env)
            terminate_at_exit(train_process)

            if self.backend_config.train_options.do_monitoring:
                # Start tensorboard
                log.info('Starting tensorboard process')
                tensorboard_process = Popen(
                    ['tensorboard', '--logdir={}'.format(train_logdir_local)])
                terminate_at_exit(tensorboard_process)

            if self.backend_config.train_options.do_eval:
                # Start eval script
                log.info('Starting eval script')
                eval_logdir = train_logdir_local
                eval_args = get_evaluation_args(eval_py, train_logdir_local,
                                                dataset_dir_local, eval_logdir,
                                                tfdl_config)
                # NOTE(review): this method never waits on or terminates
                # eval_process itself; presumably terminate_at_exit takes
                # care of it at interpreter exit -- confirm.
                eval_process = Popen(eval_args, env=train_env)
                terminate_at_exit(eval_process)

            # Wait for training and tensorboard
            log.info('Waiting for training and tensorboard processes')
            # NOTE(review): the training process's return code is not checked.
            train_process.wait()
            if self.backend_config.train_options.do_monitoring:
                tensorboard_process.terminate()

            # Export frozen graph
            log.info(
                'Exporting frozen graph ({}/model)'.format(train_logdir_local))
            export_args = get_export_args(export_py, train_logdir_local,
                                          num_classes, tfdl_config)
            export_process = Popen(export_args)
            terminate_at_exit(export_process)
            export_process.wait()

            # Package up the model files for usage as fine tuning checkpoints
            fine_tune_checkpoint_name = self.backend_config.fine_tune_checkpoint_name
            latest_checkpoints = get_latest_checkpoint(train_logdir_local)
            # Every file whose path starts with the latest checkpoint path.
            model_checkpoint_files = glob.glob(
                '{}*'.format(latest_checkpoints))
            inference_graph_path = os.path.join(train_logdir_local, 'model')

            # NOTE(review): this rebinds the `tmp_dir` parameter for the rest
            # of the method; nothing after this block reads it.
            with RVConfig.get_tmp_dir() as tmp_dir:
                model_dir = os.path.join(tmp_dir, fine_tune_checkpoint_name)
                make_dir(model_dir)
                # The tarball is written into the local output dir, so the
                # final sync below uploads it with everything else.
                model_tar = os.path.join(
                    train_logdir_local,
                    '{}.tar.gz'.format(fine_tune_checkpoint_name))
                shutil.copy(inference_graph_path,
                            '{}/frozen_inference_graph.pb'.format(model_dir))
                for path in model_checkpoint_files:
                    shutil.copy(path, model_dir)
                with tarfile.open(model_tar, 'w:gz') as tar:
                    tar.add(model_dir, arcname=os.path.basename(model_dir))

        # Perform final sync
        sync_to_dir(train_logdir_local, train_logdir, delete=False)