Example #1
    def _classify_imgs(self, img_filepaths, clf, output_dir):
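        # build one delayed classification task per image; the tasks are
        # computed in parallel below while a dask progress bar is displayed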
        pred_imgs_lazy = []
        pred_img_filepaths = []
        for img_filepath in img_filepaths:
            # filename, ext = path.splitext(path.basename(img_filepath))
            # pred_img_filepath = path.join(
            #     output_dir, f"{filename}-pred{ext}")
            pred_img_filepath = path.join(output_dir,
                                          path.basename(img_filepath))
            pred_imgs_lazy.append(
                dask.delayed(self.classify_img)(img_filepath, clf,
                                                pred_img_filepath))
            pred_img_filepaths.append(pred_img_filepath)

        with diagnostics.ProgressBar():
            dask.compute(*pred_imgs_lazy)

        return pred_img_filepaths
Example #2
def make_confusion_df(
    lidar_gdf,
    lidar_raw_dir,
    split_df=None,
    img_filepaths=None,
    n=None,
    frac=0.05,
    clf=None,
    clf_dict=None,
):
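    # build (actual, predicted) pairs lazily for a sample of validation tiles,
    # either with a single classifier (`clf`) or with one classifier per image
    # cluster (`clf_dict`)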

    c = dtr.Classifier()
    truth_pred_lazy = []
    if clf is not None:
        if split_df is None:
            num_validation_tiles = int(frac * len(img_filepaths))
            test_filepaths = random.choices(img_filepaths,
                                            k=num_validation_tiles)
        else:
            test_filepaths = _get_validation_df(split_df, n,
                                                frac)["img_filepath"]

        for img_filepath in test_filepaths:
            truth_pred_lazy.append(
                dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                          lidar_raw_dir, c, clf))
    else:
        validation_df = _get_validation_df(split_df, n, frac)
        for img_cluster, cluster_df in validation_df.groupby("img_cluster"):
            clf = clf_dict[img_cluster]
            for img_filepath in cluster_df["img_filepath"]:
                truth_pred_lazy.append(
                    dask.delayed(_inner_loop)(img_filepath, lidar_gdf,
                                              lidar_raw_dir, c, clf))

    with diagnostics.ProgressBar():
        truth_pred = np.hstack(dask.compute(*truth_pred_lazy))

    truth_ser = pd.Series(truth_pred[0], name="actual")
    pred_ser = pd.Series(truth_pred[1], name="predicted")
    return pd.crosstab(truth_ser, pred_ser) / len(truth_ser)
Example #3
    def descr_feature_matrix(self):
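        # lazily computed (and cached) descriptor feature matrix, with one
        # image descriptor row per file in `self.img_filepaths`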
        try:
            return self._descr_feature_matrix
        except AttributeError:
            kernels = filters.get_gabor_filter_bank(
                frequencies=self.gabor_frequencies,
                num_orientations=self.gabor_num_orientations)

            # num_blocks = self.response_bins_per_axis**2

            # feature_rows = [
            #      TrainingSelector._get_image_descr(
            #          img_filepath, kernels, self.response_bins_per_axis,
            #          num_blocks, self.num_color_bins)
            #      for img_filepath in self.img_filepaths
            #  ]
            values = [
                dask.delayed(
                    image_descriptor.compute_image_descriptor_from_filepath)(
                        img_filepath, kernels, self.response_bins_per_axis,
                        self.num_color_bins)
                for img_filepath in self.img_filepaths
            ]

            with diagnostics.ProgressBar():
                feature_rows = dask.compute(*values)

            self._descr_feature_matrix = np.vstack(feature_rows)

            # TODO: cache as an instance attribute (or even use a property and
            # pass this method's arguments to init), and then let people
            # interactively choose the number of PCA components until they're
            # happy with the represented variance? I vote yes.
            # TODO: cache this (via persistence): if `img_filepaths` and the
            # technical parameters coincide, load from a file instead of
            # recomputing it
            # TODO: return copy?
            return self._descr_feature_matrix
Example #4
    def train_classifiers(self, split_df, response_img_dir):
        """
        Train a classifier for each first-level cluster in `split_df`. See the
        `background <https://bit.ly/2KlCICO>`_ example notebook for more
        details.

        Parameters
        ----------
        split_df : pandas DataFrame
            Data frame with the train/test split, which must have an
            `img_cluster` column with the first-level cluster labels.
        response_img_dir : str representing path to a directory
            Path to the directory where the response tiles are located.

        Returns
        -------
        clf_dict : dictionary
            Dictionary mapping a scikit-learn AdaBoostClassifier to each
            first-level cluster label
        """
        if 'img_cluster' not in split_df:
            raise ValueError(
                "`split_df` must have an 'img_cluster' column ('cluster-II'). "
                "For 'cluster-I', use `train_classifier`.")

        clfs_lazy = {}
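        # one delayed training task per first-level cluster label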
        for img_cluster, _ in split_df.groupby('img_cluster'):
            clfs_lazy[img_cluster] = dask.delayed(self.train_classifier)(
                split_df=split_df,
                response_img_dir=response_img_dir,
                method='cluster-II',
                img_cluster=img_cluster)

        with diagnostics.ProgressBar():
            clfs_dict = dask.compute(clfs_lazy)[0]

        return clfs_dict
Example #5
def convert(data, fname_data, df_artefacts=None, fname_uncorrected=None):
    """Convert TIFF files from 2p dataset in HDF5.  Optionally create artefact-removed dataset."""
    # Important: code expects no chunking in z, y, z -- need to have -1 for these dimensions.
    data = data.rechunk(
        (64, -1, -1,
         -1))  # 64 frames will be processed together for artefact removal.

    with diagnostics.ProgressBar():
        if df_artefacts is None:
            logger.info('Writing data to %s', fname_data)
            unlink(fname_data)
            os.makedirs(fname_data.parent, exist_ok=True)
            data.to_hdf5(fname_data, HDF5_KEY)
        else:
            # This writes 2 hdf5 files, where the 2nd one depends on the same data being
            # written to the first.  Ideally, both would be written simultaneously, but
            # that cannot be done using dask.  Instead, the 1st file is written and then
            # read back to write the 2nd one.
            logger.info('Writing uncorrected data to %s', fname_uncorrected)
            unlink(fname_uncorrected)
            os.makedirs(fname_uncorrected.parent, exist_ok=True)
            data.to_hdf5(fname_uncorrected, HDF5_KEY)

            logger.info('Writing corrected data to %s', fname_data)
            with h5py.File(fname_uncorrected, 'r') as hfile:
                arr = da.from_array(hfile[HDF5_KEY])
                # Depth of 1 in the first coordinate means to bring in the frames before and after
                # the chunk -- needed for doing diffs.
                depth = (1, 0, 0, 0)
                data_corrected = arr.map_overlap(remove_artefacts,
                                                 depth=depth,
                                                 dtype=data.dtype,
                                                 df=df_artefacts,
                                                 mydepth=depth)
                unlink(fname_data)
                os.makedirs(fname_data.parent, exist_ok=True)
                data_corrected.to_hdf5(fname_data, HDF5_KEY)
Example #6
map_output = line_bag.map(line_to_words).flatten()
map_output

# In[11]:

# we cheat a bit for the reduce step
reduce_output = map_output.frequencies()
top10 = reduce_output.topk(10, lambda x: x[1])
bot10 = reduce_output.topk(10, lambda x: -x[1])
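# For reference: a minimal sketch (not from the original notebook) of an
# explicit reduce using `Bag.foldby` instead of the `frequencies()` shortcut,
# assuming the `line_bag` and `line_to_words` objects defined above.
word_counts = (line_bag.map(line_to_words).flatten().foldby(
    lambda word: word,  # group by the word itself
    lambda total, _word: total + 1, 0,  # count within each partition
    lambda a, b: a + b, 0))  # combine per-partition counts
top10_explicit = word_counts.topk(10, lambda x: x[1])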

# In[12]:

import dask.diagnostics as diag

with diag.ProgressBar(), diag.Profiler() as prof, diag.ResourceProfiler(
        0.5) as rprof:
    print("Top 10\n", top10.compute(num_workers=4))
    print("Bottom 10\n", bot10.compute(num_workers=4))

# In[13]:

diag.visualize([prof, rprof])

# # Hadoop
#
#
# Hadoop is the open-source implementation of MapReduce, developed at Yahoo and released as an Apache project. It provides the underlying infrastructure and filesystem that handle storing and distributing data, so that each machine stores some of the data locally and processing jobs run where the data is stored.
# - Non-local data is copied over the network.
# - Storage is automatically expanded along with processing power.
# - It's how Amazon, Microsoft, Yahoo, Facebook, ... deal with exabytes of data.
Example #7
def simulate_scenario_t_da(scenario_lulc_da,
                           biophysical_table_filepath,
                           ref_et_raster_filepath,
                           t_ref,
                           uhi_max,
                           ucm_params,
                           dst_t_dtype='float32',
                           rio_meta=None,
                           cc_method='factors'):
    if rio_meta is None:
        x = scenario_lulc_da['x'].values
        y = scenario_lulc_da['y'].values
        west = x[0]
        north = y[0]
        # TODO: does the method to get the transform work for all grids, i.e.,
        # regardless of whether the origin is in the upper left or lower left?
        rio_meta = dict(driver='GTiff',
                        dtype=scenario_lulc_da.dtype,
                        nodata=scenario_lulc_da.attrs['nodata'],
                        width=len(x),
                        height=len(y),
                        count=1,
                        crs=scenario_lulc_da.attrs['pyproj_srs'],
                        transform=transform.from_origin(
                            west, north, x[1] - west, north - y[1]))

    # define the function here so that the fixed arguments are curried
    def _t_from_lulc(lulc_arr):
        with tempfile.TemporaryDirectory() as tmp_dir:
            lulc_raster_filepath = path.join(tmp_dir, 'lulc.tif')
            with rio.open(lulc_raster_filepath, 'w', **rio_meta) as dst:
                dst.write(lulc_arr, 1)
            ucm_wrapper = iuc.UCMWrapper(lulc_raster_filepath,
                                         biophysical_table_filepath,
                                         cc_method,
                                         ref_et_raster_filepath,
                                         t_ref,
                                         uhi_max,
                                         extra_ucm_args=ucm_params,
                                         workspace_dir=tmp_dir)
            return ucm_wrapper.predict_t_arr(0)

    scenario_t_da = xr.DataArray(
        dims=scenario_lulc_da.dims,
        coords=scenario_lulc_da.coords,
        attrs=dict(nodata=np.nan,
                   pyproj_srs=scenario_lulc_da.attrs['pyproj_srs']))

    change_nums = scenario_t_da['change_num'].values
    scenario_runs = scenario_t_da.coords.get('scenario_run', None)

    def _simulate_and_repeat(change_num):
        # simulate once and repeat it for all scenario runs
        lulc_da = scenario_lulc_da.sel(change_num=change_num)
        if scenario_runs is not None:
            t_arr = _t_from_lulc(lulc_da.isel(scenario_run=0))
            t_arr = np.array(
                [t_arr for scenario_run in scenario_t_da['scenario_run']],
                dtype=dst_t_dtype)
        else:
            t_arr = _t_from_lulc(lulc_da)
        return t_arr

    if change_nums[0] == 0:
        scenario_t_da.loc[dict(change_num=0)] = _simulate_and_repeat(0)
        change_nums = change_nums[1:]

    scenario_dims = scenario_lulc_da.dims[:-2]
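    # stack the scenario dimensions so that each (y, x) LULC slice becomes one
    # delayed `_t_from_lulc` simulation, computed in parallel below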
    stacked_da = scenario_lulc_da.sel(change_num=change_nums).stack(
        scenario=scenario_dims).transpose('scenario', 'y', 'x')
    with diagnostics.ProgressBar():
        scenario_t_da.loc[dict(change_num=change_nums)] = xr.DataArray(
            np.array(
                dask.compute(*[
                    dask.delayed(_t_from_lulc)(_scenario_lulc_da)
                    for _scenario_lulc_da in stacked_da
                ],
                             scheduler='processes')).astype(dst_t_dtype),
            dims=stacked_da.dims,
            coords={dim: stacked_da.coords[dim]
                    for dim in stacked_da.dims},
            attrs=dict(dtype=dst_t_dtype)).unstack(dim='scenario').transpose(
                *scenario_dims, 'y', 'x')
    # replace nodata values - UCM/InVEST uses minus infinity, so we can use
    # temperatures below absolute zero as a reference threshold, which
    # (physically) makes sense
    return scenario_t_da.where(scenario_t_da > -273.15, np.nan)
Example #8
    def build_features(self,
                       split_df=None,
                       img_filepaths=None,
                       img_dir=None,
                       img_filename_pattern=None,
                       method=None,
                       img_cluster=None):
        """
        Build the pixel features for a list of images

        Parameters
        ----------
        split_df : pd.DataFrame, optional
            Data frame with the train/test split.
        img_filepaths : list of image file paths, optional
            List of images to be transformed into features. Alternatively, the
            same information can be provided by means of the `img_dir` and
            `img_filename_pattern` keyword arguments. Ignored if providing
            `split_df`
        img_dir : str representing path to a directory, optional
            Path to the directory where the images whose filename matches
            `img_filename_pattern` are to be located. Ignored if `split_df` or
            `img_filepaths` is provided.
        img_filename_pattern : str representing a file-name pattern, optional
            Filename pattern to be matched in order to obtain the list of
            images. If no value is provided, the default value set in
            `settings.IMG_DEFAULT_FILENAME_PATTERN` will be taken. Ignored if
            `split_df` or `img_filepaths` is provided.
        method : {'cluster-I', 'cluster-II'}, optional
            Method used in the train/test split
        img_cluster : int, optional
            The label of the cluster of images. Only used if `method` is
            'cluster-II'.

        Returns
        -------
        X : np.ndarray
            Array with the pixel features
        """
        # TODO: accept `neighborhoods` kwarg
        if split_df is not None:
            if method is None:
                if 'img_cluster' in split_df:
                    method = 'cluster-II'
                else:
                    method = 'cluster-I'

            if method == 'cluster-I':
                # dump_train_feature_arrays(split_df, output_filepath)
                img_filepaths = split_df[split_df['train']]['img_filepath']
            else:
                if img_cluster is None:
                    raise ValueError(
                        "If `method` is 'cluster-II', `img_cluster` must be "
                        "provided")
                img_filepaths = utils.get_img_filepaths(
                    split_df, img_cluster, True)

        else:
            if img_filepaths is None:
                if img_filename_pattern is None:
                    img_filename_pattern = \
                        settings.IMG_DEFAULT_FILENAME_PATTERN
                if img_dir is None:
                    raise ValueError(
                        "Either `split_df`, `img_filepaths` or `img_dir` must "
                        "be provided")

                img_filepaths = glob.glob(
                    path.join(img_dir, img_filename_pattern))

        values = [
            dask.delayed(self.build_features_from_filepath)(img_filepath)
            for img_filepath in img_filepaths
        ]

        with diagnostics.ProgressBar():
            X = dask.compute(*values)

        return np.vstack(X)
Example #9
# In[ ]:


# get image dimensions
def get_dims(file):
    img = cv2.imread(file)
    h, w = img.shape[:2]
    return h, w


# parallelize
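# build a dask bag over the file list and map get_dims over it lazily;
# compute() below evaluates the bag in parallel with a progress bar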
filepath = '../input/stage_1_test_images/'
filelist = [filepath + f for f in os.listdir(filepath)]
dimsbag = bag.from_sequence(filelist).map(get_dims)
with diagnostics.ProgressBar():
    dims = dimsbag.compute()

dim_df = pd.DataFrame(dims, columns=['height', 'width'])
sizes = dim_df.groupby(['height', 'width'
                        ]).size().reset_index().rename(columns={0: 'count'})
sizes.hvplot.scatter(x='height',
                     y='width',
                     size='count',
                     xlim=(0, 1200),
                     ylim=(0, 1200),
                     grid=True,
                     xticks=2,
                     yticks=2,
                     height=500,
                     width=600).options(scaling_factor=0.1,