Example #1
    def __init__(self, yaml_file: str, cluster=False):
        def _grp(d, k, msg=None):
            """
            Get required parameter.
            """
            try:
                return d[k]
            except KeyError:
                if msg is None:
                    msg = f"Required parameter {k} not present in config."
                _logger.exception(msg)
                raise

        Config._configure_pyyaml()
        with open(yaml_file, 'r') as f:
            s = yaml.load(f, Loader=Config.yaml_loader)
        self.name = path.basename(yaml_file).rsplit(".", 1)[0]

        # LEARNING BLOCK
        if not cluster:
            learn_block = _grp(s, 'learning')
            self.clustering = False
            self.cluster_analysis = False
            self.algorithm = _grp(
                learn_block, 'algorithm',
                "'algorithm' must be provided as part of 'learning' block.")

            self.algorithm_args = _grp(
                learn_block, 'arguments',
                "'arguments' must be provided for learning algorithm.")
        # CLUSTERING BLOCK
        else:
            cb = _grp(s, 'clustering',
                      "'clustering' block must be provided when clustering.")
            self.clustering = True
            self.algorithm = cb.get('algorithm', 'kmeans')
            self.n_classes = _grp(
                cb, 'n_classes',
                "'n_classes' must be provided when clustering.")
            self.oversample_factor = _grp(
                cb, 'oversample_factor',
                "'oversample_factor' must be provided when clustering.")
            self.cluster_analysis = cb.get('cluster_analysis', False)
            self.class_file = cb.get('file')
            if self.class_file:
                self.class_property = _grp(
                    cb, 'property', "'property' must be provided when "
                    "providing a file for semisupervised clustering.")
            self.semi_supervised = self.class_file is not None

        # Set flags based on algorithm being used - these control
        # some special behaviours in the code.
        self.cubist = self.algorithm == 'cubist'
        self.multicubist = self.algorithm == 'multicubist'
        self.multirandomforest = self.algorithm == 'multirandomforest'
        self.krige = self.algorithm == 'krige'

        # PICKLING BLOCK
        pk_block = s.get('pickling')
        if pk_block:
            self.pk_covariates = pk_block.get('covariates')
            self.pk_targets = pk_block.get('targets')

            # Load from pickle files if covariates and targets exist.
            self.pk_load = self.pk_covariates and os.path.exists(self.pk_covariates) \
                           and self.pk_targets and os.path.exists(self.pk_targets)

            if self.cubist or self.multicubist:
                self.pk_featurevec = pk_block.get('featurevec')
                # If running multicubist, we also need featurevec to load from pickle files.
                self.pk_load = self.pk_load \
                               and self.pk_featurevec and os.path.exists(self.pk_featurevec)
        else:
            self.pk_load = False
            self.pk_covariates = None
            self.pk_targets = None
            self.pk_featurevec = None

        # FEATURES BLOCK
        # Todo: fix get_image_spec so features are optional if using pickled data.
        # if not self.pk_load:
        _logger.warning(
            "'features' are required even when loading from pickled data - this is "
            "a work around for getting image specifications. Needs to be fixed."
        )
        features = _grp(
            s, 'features',
            "'features' block must be provided when not loading "
            "from pickled data.")
        self.feature_sets = [FeatureSetConfig(f) for f in features]

        # Not yet implemented.
        if 'patchsize' in s:
            _logger.info("Patchsize currently fixed at 0 -- ignoring")
        self.patchsize = 0

        # TARGET BLOCK
        if not self.pk_load:
            tb = _grp(
                s, 'targets',
                "'targets' block my be provided when not loading from "
                "pickled data.")
            self.target_file = _grp(
                tb, 'file', "'file' needs to be provided when specifying "
                "targets.")
            self.target_property = _grp(
                tb, 'property', "'property needs to be provided when "
                "specifying targets.")
            self.resample = tb.get('resample')

        # FINAL TRANSFORM BLOCK
        ftb = s.get('final_transform')
        if ftb is not None:
            _, im, trans_g = _parse_transform_set(ftb.get('transforms'),
                                                  ftb.get('imputation'))
            self.final_transform = transforms.TransformSet(im, trans_g)
        else:
            self.final_transform = None

        # VALIDATION BLOCK
        vb = s.get('validation')
        if vb:
            self.rank_features = vb.get('feature_rank', False)
            if self.pk_load and self.rank_features:
                _logger.warning(
                    "Feature ranking cannot be performed when loading covariates and "
                    "targets from pickled data.")
                self.rank_features = False
            self.permutation_importance = vb.get('permutation_importance',
                                                 False)
            kfb = vb.get('k-fold')
            if kfb:
                self.cross_validate = True
                self.folds = _grp(
                    kfb, 'folds',
                    "'folds' (number of folds) must be specified "
                    "if k-fold cross validation is being used.")
                self.crossval_seed = _grp(
                    kfb, 'random_seed', "'random_seed' must be specified "
                    "if k-fold cross validation is being used.")
                self.parallel_validate = kfb.get('parallel', False)
        else:
            self.cross_validate = False
            self.rank_features = False
            self.permutation_importance = False
            self.parallel_validate = False

        # OPTIMISATION BLOCK
        # Note: optimisation options get parsed in scripts/gridsearch.py
        self.optimisation = s.get('optimisation')

        # PREDICT BLOCK
        pb = _grp(s, 'prediction', "'prediction' block must be provided.")
        self.geotif_options = pb.get('geotif', {})
        self.quantiles = _grp(
            pb, 'quantiles', "'quantiles' must be provided as part of "
            "prediction block.")
        self.outbands = _grp(
            pb, 'outbands',
            "'outbands' must be provided as part of prediction "
            "block.")
        self.thumbnails = pb.get('thumbnails', 10)
        mb = s.get('mask')
        if mb:
            self.mask = mb.get('file')
            if self.mask and not os.path.exists(self.mask):
                self.mask = None
            if self.mask:
                self.retain = _grp(
                    mb, 'retain', "'retain' must be provided if providing a "
                    "prediction mask.")
        else:
            self.mask = None

        if self.krige:
            # Todo: don't know if lon/lat is compulsory or not for kriging
            self.lon_lat = s.get('lon_lat')
        else:
            self.lon_lat = None

        # OUTPUT BLOCK
        def _outpath(filename):
            return os.path.join(self.output_dir, self.name + filename)

        ob = _grp(s, 'output', "'output' block is required.")
        self.output_dir = _grp(ob, 'directory',
                               "'directory' for output is required.")
        self.model_file = ob.get('model', _outpath('.model'))

        if ob.get('plot_feature_ranks', False):
            self.plot_feature_ranks = _outpath('_featureranks.png')
            self.plot_feature_rank_curves = _outpath('_featurerank_curves.png')
        else:
            self.plot_feature_ranks = None
            self.plot_feature_rank_curves = None

        if ob.get('plot_intersection', False):
            self.plot_intersection = _outpath('_intersected.png')
        else:
            self.plot_intersection = None

        if ob.get('plot_real_vs_pred', False):
            self.plot_real_vs_pred = _outpath('_real_vs_pred.png')
            self.plot_residual = _outpath('_residual.png')
        else:
            self.plot_real_vs_pred = None
            self.plot_residual = None

        if ob.get('plot_correlation', False):
            self.plot_correlation = _outpath('_correlation.png')
        else:
            self.plot_correlation = None

        if ob.get('plot_target_scaling', False):
            self.plot_target_scaling = _outpath('_target_scaling.png')
        else:
            self.plot_target_scaling = None

        self.raw_covariates = _outpath('_rawcovariates.csv')
        self.raw_covariates_mask = _outpath('_rawcovariates_mask.csv')
        self.feature_ranks_file = _outpath('_featureranks.json')
        self.crossval_scores_file = _outpath('_crossval_scores.json')
        self.crossval_results_file = _outpath('_crossval_results.csv')
        self.crossval_results_plot = _outpath('_crossval_results.png')
        self.dropped_targets_file = _outpath('_dropped_targets.txt')
        self.transformed_targets_file = _outpath('_transformed_targets.csv')
        self.metadata_file = _outpath('_metadata.txt')
        self.optimisation_results_file = _outpath('_optimisation.csv')
        self.prediction_file = _outpath('_{}.tif')

        paths = [self.output_dir, os.path.split(self.model_file)[0]]
        for p in paths:
            if p:
                makedirs(p, exist_ok=True)
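Usage note: the following is a minimal, hypothetical sketch of a config satisfying the required blocks parsed above, and of loading it. The block names mirror the `_grp` calls in the code; the inner layout of each feature set (consumed by `FeatureSetConfig`) and all file paths are illustrative assumptions, not the project's documented schema.

import textwrap

# Hypothetical minimal YAML exercising the blocks the constructor
# requires in learning mode: 'learning', 'features', 'targets',
# 'prediction' and 'output'.
demo_yaml = textwrap.dedent("""\
    learning:
      algorithm: randomforest
      arguments:
        n_estimators: 100
    features:
      - type: ordinal              # layout consumed by FeatureSetConfig
        files:
          - path: ./covariates/k_ppm.tif
    targets:
      file: ./targets.shp
      property: K_ppm
    prediction:
      quantiles: 0.95
      outbands: 1
    output:
      directory: ./out
    """)

with open('demo.yaml', 'w') as f:
    f.write(demo_yaml)

config = Config('demo.yaml')             # cluster defaults to False
assert config.algorithm == 'randomforest'
assert config.cross_validate is False    # no 'validation' block given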
Example #2
    def __init__(self, yaml_file):
        with open(yaml_file, 'r') as f:
            s = yaml.safe_load(f)
        self.name = path.basename(yaml_file).rsplit(".", 1)[0]

        # TODO expose this option when fixed
        if 'patchsize' in s:
            log.info("Patchsize currently fixed at 0 -- ignoring")
        self.patchsize = 0

        if 'intersected_features' in s:
            self.intersected_features = s['intersected_features']
        else:
            self.intersected_features = None

        if 'learning' in s:
            self.algorithm = s['learning']['algorithm']
            self.algorithm_args = s['learning']['arguments']
        else:
            self.algorithm = None

        self.cubist = self.algorithm == 'cubist'
        self.multicubist = self.algorithm == 'multicubist'
        self.multirandomforest = self.algorithm == 'multirandomforest'
        self.krige = self.algorithm == 'krige'

        if 'prediction' in s:
            self.quantiles = s['prediction']['quantiles']
            self.geotif_options = s['prediction'].get('geotif', {})
            self.outbands = s['prediction'].get('outbands')
            self.thumbnails = s['prediction'].get('thumbnails', 10)

        if 'features' in s:
            self.pickle = any(d['type'] == 'pickle' for d in s['features'])
        else:
            self.pickle = False

        self.rawcovariates = False
        self.train_data_pk = False
        if self.pickle:
            self.pickle_load = True
            for n, d in enumerate(s['features']):
                if d['type'] == 'pickle':
                    if 'covariates' in d['files']:
                        self.pickled_covariates = \
                            path.abspath(d['files']['covariates'])
                    if 'targets' in d['files']:
                        self.pickled_targets = d['files']['targets']
                    if 'rawcovariates' in d['files']:
                        self.rawcovariates = d['files']['rawcovariates']
                        self.rawcovariates_mask = \
                            d['files']['rawcovariates_mask']
                    if 'train_data_pk' in d['files']:
                        self.train_data_pk = d['files']['train_data_pk']
                    if not (path.exists(d['files'].get('covariates', ''))
                            and path.exists(d['files'].get('targets', ''))):
                        self.pickle_load = False
                    if self.cubist or self.multicubist:
                        if 'featurevec' in d['files']:
                            self.featurevec = \
                                path.abspath(d['files']['featurevec'])
                        if not path.exists(d['files'].get('featurevec', '')):
                            self.pickle_load = False
                    if 'plot_covariates' in d['files']:
                        self.plot_covariates = d['files']['plot_covariates']
                    else:
                        self.plot_covariates = False
            # Drop 'pickle' entries once finished so only real feature sets
            # remain (popping while iterating can skip elements).
            s['features'] = [d for d in s['features'] if d['type'] != 'pickle']
        else:
            self.pickle_load = False

        if 'features' in s:
            self.feature_sets = [FeatureSetConfig(k) for k in s['features']]

        if 'preprocessing' in s:
            final_transform = s['preprocessing']
            if 'transforms' not in final_transform:
                final_transform['transforms'] = None
            if 'imputation' not in final_transform:
                final_transform['imputation'] = None
            _, im, trans_g = _parse_transform_set(
                final_transform['transforms'], final_transform['imputation'])
            self.final_transform = transforms.TransformSet(im, trans_g)
        else:
            self.final_transform = None

        self.output_dir = s['output']['directory']
        # create output dir if does not exist
        makedirs(self.output_dir, exist_ok=True)

        if 'oos_validation' in s:
            self.oos_validation_file = s['oos_validation']['file']
            self.oos_validation_property = s['oos_validation']['property']

        if 'targets' in s:
            self.target_file = s['targets']['file']
            if 'property' in s['targets']:
                self.target_property = s['targets']['property']
            self.resample = None
            if 'resample' in s['targets']:
                self.resample = s['targets']['resample']
                self.value_resampling_args = None
                self.spatial_resampling_args = None
                if 'value' in s['targets']['resample']:
                    self.value_resampling_args = s['targets']['resample']['value']
                if 'spatial' in s['targets']['resample']:
                    self.spatial_resampling_args = s['targets']['resample']['spatial']
                if self.value_resampling_args is None and self.spatial_resampling_args is None:
                    raise ValueError("provide at least one of value or spatial args for resampling")

            if 'group_targets' in s['targets']:
                self.group_targets = True
                self.groups_eps = s['targets']['group_targets']['groups_eps']
                if 'group_col' in s['targets']['group_targets']:
                    self.group_col = s['targets']['group_targets']['group_col']
                else:
                    self.group_col = None
            else:
                self.group_targets = False
            self.target_groups_file = path.join(self.output_dir, 'target_groups.jpg')
            if 'weight_col_name' in s['targets']:
                self.weighted_model = True
                self.weight_col_name = s['targets']['weight_col_name']
            else:
                self.weighted_model = False

            if 'group' in s['targets']:
                self.output_group_col_name = s['targets']['group']['output_group_col_name']
                if 'spatial' in s['targets']['group']:
                    self.spatial_grouping_args = s['targets']['group']['spatial']
                else:
                    self.spatial_grouping_args = {}
                if 'fields_to_keep' in s['targets']['group']:
                    self.grouping_fields_to_keep = s['targets']['group']['fields_to_keep']
                else:
                    self.grouping_fields_to_keep = []
                self.grouped_output = Path(self.output_dir).joinpath(s['output']['grouped_shapefile'])

            if 'split' in s['targets']:
                self.split_group_col_name = s['targets']['split']['group_col_name']
                self.split_oos_fraction = s['targets']['split']['oos_fraction']
                self.train_shapefile = Path(self.output_dir).joinpath(s['output']['train_shapefile'])
                self.oos_shapefile = Path(self.output_dir).joinpath(s['output']['oos_shapefile'])

        self.mask = None
        if 'mask' in s:
            self.mask = s['mask']['file']
            self.retain = s['mask']['retain']  # mask areas that are predicted

        if 'pca' in s:
            self.pca = True
            preprocessing_transforms = s['preprocessing']['transforms']
            if 'n_components' in preprocessing_transforms[0]['whiten']:
                self.n_components = preprocessing_transforms[0]['whiten']['n_components']
            else:
                self.n_components = None
            if 'geotif' not in s['pca']:
                tif_opts = {}
            else:
                tif_opts = s['pca']['geotif'] if s['pca']['geotif'] is not None else {}
            self.geotif_options = tif_opts
            self.pca_json = path.join(self.output_dir, s['output']['pca_json'])
        else:
            self.pca = False

        self.lon_lat = False
        if 'lon_lat' in s:
            self.lon_lat = True
            self.lat = s['lon_lat']['lat']
            self.lon = s['lon_lat']['lon']

        # TODO pipeline this better
        self.rank_features = False
        self.permutation_importance = False
        self.feature_importance = False
        self.cross_validate = False
        self.parallel_validate = False
        if 'validation' in s:
            for i in s['validation']:
                if i == 'feature_rank':
                    self.rank_features = True
                if i == 'permutation_importance':
                    self.permutation_importance = True
                if i == 'feature_importance':
                    self.feature_importance = True
                if i == 'parallel':
                    self.parallel_validate = True
                if type(i) is dict and 'k-fold' in i:
                    self.cross_validate = True
                    self.folds = i['k-fold']['folds']
                    self.crossval_seed = i['k-fold']['random_seed']
                    break

        if self.rank_features and self.pickle_load:
            self.pickle_load = False
            log.info('Feature ranking does not work with '
                     'pickled files. Pickled files will not be used. '
                     'All covariates will be intersected.')

        self.optimised_model = False
        if 'learning' in s:
            self.hpopt = False
            self.skopt = False
            if 'optimisation' in s['learning']:
                if 'searchcv_params' in s['learning']['optimisation']:
                    self.opt_searchcv_params = s['learning']['optimisation']['searchcv_params']
                    self.opt_params_space = s['learning']['optimisation']['params_space']
                    self.skopt = True
                if 'hyperopt_params' in s['learning']['optimisation']:
                    self.hyperopt_params = s['learning']['optimisation']['hyperopt_params']
                    self.hp_params_space = s['learning']['optimisation']['hp_params_space']
                    self.hpopt = True

                if self.skopt and self.hpopt:
                    raise ConfigException("Only one of searchcv_params or hyperopt_params can be specified")

        self.cluster_analysis = False
        self.clustering = False
        if 'clustering' in s:
            self.clustering = True
            self.clustering_algorithm = s['clustering']['algorithm']
            cluster_args = s['clustering']['arguments']
            self.n_classes = cluster_args['n_classes']
            self.oversample_factor = cluster_args['oversample_factor']
            if 'file' in s['clustering'] and s['clustering']['file']:
                self.semi_supervised = True
                self.class_file = s['clustering']['file']
                self.class_property = s['clustering']['property']
            else:
                self.semi_supervised = False
            if 'cluster_analysis' in s['clustering']:
                self.cluster_analysis = s['clustering']['cluster_analysis']

        output_model = s['output']['model'] if 'model' in s['output'] \
            else self.name + ('.cluster' if self.clustering else '.model')

        self.model_file = Path(self.output_dir).joinpath(output_model)
        self.resampled_output = Path(self.output_dir).joinpath(Path(self.target_file).stem + '_resampled.shp')
        self.optimisation_output_skopt = Path(self.output_dir).joinpath(self.name + '_optimisation_skopt.csv')
        self.optimisation_output_hpopt = Path(self.output_dir).joinpath(self.name + '_optimisation_hpopt.csv')
        self.optimised_model_params = Path(self.output_dir).joinpath(self.name + "_optimised_params.json")
        self.optimised_model_file = Path(self.output_dir).joinpath(self.name + "_optimised.model")
        self.outfile_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json")
        self.optimised_model_scores = Path(self.output_dir).joinpath(self.name + "_optimised_scores.json")
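The pickle handling in this variant keys off a features entry with `type: pickle` whose `files` mapping points at previously saved artifacts. Below is a hedged sketch of the parsed-dict shape that loop expects and of the detection logic it applies; all paths are hypothetical.

from os import path

# Hypothetical parsed-YAML fragment for the pickle branch above.
s = {
    'features': [
        {
            'type': 'pickle',
            'files': {
                'covariates': './pickled/covariates.pk',
                'targets': './pickled/targets.pk',
            },
        },
        {'type': 'ordinal', 'files': [{'path': './covariates/k_ppm.tif'}]},
    ]
}

# Mirrors the logic above: pickling is active if any entry has type
# 'pickle'; loading only proceeds when both pickle files exist on disk.
pickle = any(d['type'] == 'pickle' for d in s['features'])
pickle_load = pickle and all(
    path.exists(d['files'][k])
    for d in s['features'] if d['type'] == 'pickle'
    for k in ('covariates', 'targets')
)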
Example #3
    def __init__(self,
                 yaml_file,
                 clustering=False,
                 learning=False,
                 resampling=False,
                 predicting=False,
                 shiftmap=True):
        def _grp(d, k, msg=None):
            """
            Get required parameter.
            """
            try:
                return d[k]
            except KeyError:
                if msg is None:
                    msg = f"Required parameter {k} not present in config."
                _logger.exception(msg)
                raise

        Config._configure_pyyaml()
        with open(yaml_file, 'r') as f:
            try:
                s = yaml.load(f, Loader=Config.yaml_loader)
            except UnicodeDecodeError:
                if yaml_file.endswith('.model'):
                    _logger.error(
                        "You're attempting to run uncoverml but have provided the "
                        "'.model' file instead of the '.yaml' config file. The predict "
                        "now requires the configuration file and not the model. Please "
                        "try rerunning the command with the configuration file."
                    )
                else:
                    _logger.error(
                        "Couldn't parse the yaml file. Ensure you've provided the correct "
                        "file as config file and that the YAML is valid.")
        self.name = path.basename(yaml_file).rsplit(".", 1)[0]

        if clustering:
            # CLUSTERING BLOCK
            cb = _grp(s, 'clustering',
                      "'clustering' block must be provided when clustering.")
            self.clustering = True
            self.algorithm = cb.get('algorithm', 'kmeans')
            self.n_classes = _grp(
                cb, 'n_classes',
                "'n_classes' must be provided when clustering.")
            self.oversample_factor = _grp(
                cb, 'oversample_factor',
                "'oversample_factor' must be provided when clustering.")
            self.cluster_analysis = cb.get('cluster_analysis', False)
            self.class_file = cb.get('file')
            if self.class_file:
                self.class_property = _grp(
                    cb, 'property', "'property' must be provided when "
                    "providing a file for semisupervised clustering.")
            self.semi_supervised = self.class_file is not None
        elif learning:
            # LEARNING BLOCK
            learn_block = _grp(s, 'learning')
            self.clustering = False
            self.cluster_analysis = False
            self.target_search = learn_block.get('target_search', False)
            self.targetsearch_threshold = learn_block.get(
                'target_search_threshold', 0.8)
            tsexb = learn_block.get('target_search_extents')
            self.targetsearch_extents, self.tse_are_pixel_coordinates = Config.parse_extents(
                tsexb)
            self.algorithm = _grp(
                learn_block, 'algorithm',
                "'algorithm' must be provided as part of 'learning' block.")
            self.algorithm_args = learn_block.get('arguments', {})
        else:
            self.bootstrap = False
            self.algorithm = None
            self.clustering = False
            self.cluster_analysis = False
            self.target_search = False

        self.set_algo_flags()

        # EXTENTS
        exb = s.get('extents')
        self.extents, self.extents_are_pixel_coordinates = Config.parse_extents(
            exb)

        _logger.debug("loaded crop box %s", self.extents)

        # PICKLING BLOCK
        pk_block = s.get('pickling')
        if pk_block:
            self.pk_covariates = pk_block.get('covariates')
            self.pk_targets = pk_block.get('targets')

            # Load from pickle files if covariates and targets exist.
            self.pk_load = self.pk_covariates and os.path.exists(self.pk_covariates) \
                           and self.pk_targets and os.path.exists(self.pk_targets)

            if self.cubist or self.multicubist:
                self.pk_featurevec = pk_block.get('featurevec')
                # If running multicubist, we also need featurevec to load from pickle files.
                self.pk_load = self.pk_load \
                               and self.pk_featurevec and os.path.exists(self.pk_featurevec)
        else:
            self.pk_load = False
            self.pk_covariates = None
            self.pk_targets = None
            self.pk_featurevec = None

        # FEATURES BLOCK
        # Todo: fix get_image_spec so features are optional if using pickled data.
        # if not self.pk_load:
        if not resampling:
            _logger.warning(
                "'features' are required even when loading from pickled data - this "
                "is a work around for getting image specifications. Needs to be fixed."
            )
            features = _grp(
                s, 'features',
                "'features' block must be provided when not loading "
                "from pickled data.")
            print("loading features")
            self.feature_sets = [FeatureSetConfig(f) for f in features]
            # Mixing tabular and image features not currently supported
            if any(f.tabular for f in self.feature_sets):
                self.tabular_prediction = True
                if not all(f.tabular for f in self.feature_sets):
                    raise ValueError(
                        "Mixing tabular and image features not currently supported. Ensure "
                        "features are only sourced from 'files' or 'table' but not both."
                    )
            else:
                self.tabular_prediction = False

        # Not yet implemented.
        if 'patchsize' in s:
            _logger.info("Patchsize currently fixed at 0 -- ignoring")
        self.patchsize = 0

        # TARGET BLOCK
        if (not predicting and not clustering) or shiftmap:
            tb = _grp(
                s, 'targets',
                "'targets' block must be provided when not loading from "
                "pickled data.")
            self.target_file = _grp(
                tb, 'file', "'file' needs to be provided when specifying "
                "targets.")
            if not path.exists(self.target_file):
                raise FileNotFoundError(
                    "Target shapefile provided in config does not exist. Check "
                    "that the 'file' property of the 'targets' block is correct."
                )
            self.target_property = _grp(
                tb, 'property', "'property needs to be provided when "
                "specifying targets.")
            self.target_drop_values = tb.get('drop', None)
            self.target_weight_property = tb.get('weight_property')
            self.fields_to_write_to_csv = tb.get('write_to_csv')
            self.shiftmap_targets = tb.get('shiftmap')
            rb = tb.get('resample')
            if rb:
                self.spatial_resampling_args = rb.get('spatial')
                self.value_resampling_args = rb.get('value')
                if not (self.spatial_resampling_args
                        or self.value_resampling_args):
                    raise ValueError(
                        "At least one of 'spatial' or 'value' resampling parameters "
                        "must be provided when resampling.")

        # FINAL TRANSFORM BLOCK
        ftb = s.get('final_transform')
        if ftb is not None:
            _, im, trans_g = _parse_transform_set(ftb.get('transforms'),
                                                  ftb.get('imputation'))
            self.final_transform = transforms.TransformSet(im, trans_g)
        else:
            self.final_transform = None

        # VALIDATION BLOCK
        vb = s.get('validation')
        if vb:
            oos = vb.get('out_of_sample')
            if oos:
                self.oos_percentage = oos.get('percentage', None)
                self.oos_shapefile = oos.get('shapefile', None)
                self.oos_property = oos.get('property', None)
            self.out_of_sample_validation = oos is not None
            self.rank_features = vb.get('feature_rank', False)
            if self.pk_load and self.rank_features:
                _logger.warning(
                    "Feature ranking cannot be performed when loading covariates and "
                    "targets from pickled data.")
                self.rank_features = False
            self.permutation_importance = vb.get('permutation_importance',
                                                 False)
            kfb = vb.get('k-fold')
            if kfb:
                self.folds = _grp(
                    kfb, 'folds',
                    "'folds' (number of folds) must be specified "
                    "if k-fold cross validation and/or feature ranking is being used."
                )
                self.crossval_seed = _grp(
                    kfb, 'random_seed', "'random_seed' must be specified "
                    "if k-fold cross validation and/or feature ranking is "
                    "being used.")
                self.parallel_validate = kfb.get('parallel', False)
            elif self.rank_features:
                # Feature ranking requires crossval params. Provide defaults if not available.
                self.folds = 5
                self.crossval_seed = 1
                self.parallel_validate = False
            self.cross_validate = kfb is not None
        else:
            self.rank_features = False
            self.permutation_importance = False
            self.parallel_validate = False
            self.out_of_sample_validation = False
            self.cross_validate = False

        # OPTIMISATION BLOCK
        # Note: optimisation options get parsed in scripts/gridsearch.py
        self.optimisation = s.get('optimisation')

        # PREDICT BLOCK
        if predicting:
            pb = _grp(s, 'prediction', "'prediction' block must be provided.")
            self.geotif_options = pb.get('geotif', {})
            self.quantiles = _grp(
                pb, 'quantiles', "'quantiles' must be provided as part of "
                "prediction block.")
            self.outbands = _grp(
                pb, 'outbands',
                "'outbands' must be provided as part of prediction "
                "block.")
            self.thumbnails = pb.get('thumbnails', 10)
            self.bootstrap_predictions = pb.get('bootstrap')
            mb = s.get('mask')
            if mb:
                self.mask = mb.get('file')
                if not self.mask or not os.path.exists(self.mask):
                    raise FileNotFoundError(
                        "Mask file provided in config does not exist. Check that "
                        "the 'file' property of the 'mask' block is correct.")
                self.retain = _grp(
                    mb, 'retain', "'retain' must be provided if providing a "
                    "prediction mask.")
            else:
                self.mask = None

            if self.krige:
                # Todo: don't know if lon/lat is compulsory or not for kriging
                self.lon_lat = s.get('lon_lat')
            else:
                self.lon_lat = None

        # OUTPUT BLOCK
        def _outpath(filename):
            return os.path.join(self.output_dir, self.name + filename)

        ob = _grp(s, 'output', "'output' block is required.")
        self.output_dir = _grp(ob, 'directory',
                               "'directory' for output is required.")
        self.model_file = ob.get('model', _outpath('.model'))

        if ob.get('plot_feature_ranks', False):
            self.plot_feature_ranks = _outpath('_featureranks.png')
            self.plot_feature_rank_curves = _outpath('_featurerank_curves.png')
        else:
            self.plot_feature_ranks = None
            self.plot_feature_rank_curves = None

        if ob.get('plot_intersection', False):
            self.plot_intersection = _outpath('_intersected.png')
        else:
            self.plot_intersection = None

        if ob.get('plot_real_vs_pred', False):
            self.plot_real_vs_pred = _outpath('_real_vs_pred.png')
            self.plot_residual = _outpath('_residual.png')
        else:
            self.plot_real_vs_pred = None
            self.plot_residual = None

        if ob.get('plot_correlation', False):
            self.plot_correlation = _outpath('_correlation.png')
        else:
            self.plot_correlation = None

        if ob.get('plot_target_scaling', False):
            self.plot_target_scaling = _outpath('_target_scaling.png')
        else:
            self.plot_target_scaling = None

        self.raw_covariates = _outpath('_rawcovariates.csv')
        self.raw_covariates_mask = _outpath('_rawcovariates_mask.csv')

        self.feature_ranks_file = _outpath('_featureranks.json')

        self.crossval_scores_file = _outpath('_crossval_scores.json')
        self.crossval_results_file = _outpath('_crossval_results.csv')
        self.crossval_results_plot = _outpath('_crossval_results.png')
        self.oos_scores_file = _outpath('_oos_scores.json')
        self.oos_results_file = _outpath('_oos_results.csv')
        self.oos_targets_file = _outpath('_oos_targets.shp')

        self.dropped_targets_file = _outpath('_dropped_targets.txt')
        self.transformed_targets_file = _outpath('_transformed_targets.csv')

        self.metadata_file = _outpath('_metadata.txt')

        self.optimisation_results_file = _outpath('_optimisation.csv')

        self.prediction_file = _outpath('_{}.tif')
        self.prediction_shapefile = _outpath('_prediction')
        self.prediction_prjfile = _outpath('_prediction.prj')

        self.shiftmap_file = _outpath('_shiftmap_{}.tif')
        self.shiftmap_points = _outpath('_shiftmap_generated_points.csv')

        self.targetsearch_generated_points = _outpath(
            '_targetsearch_generated_points.csv')
        self.targetsearch_likelihood = _outpath('_targetsearch_likelihood.csv')
        self.targetsearch_result_data = _outpath('_targetsearch_result.pk')

        self.resampled_shapefile_dir = os.path.join(self.output_dir,
                                                    '{}_resampled')

        paths = [self.output_dir, os.path.split(self.model_file)[0]]
        for p in paths:
            if p:
                makedirs(p, exist_ok=True)

        self._tmpdir = None
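Unlike the earlier variants, this constructor threads explicit mode flags through parsing, so the set of required blocks depends on how it is invoked. A hedged usage sketch follows; the config path is hypothetical, and the keyword flags mirror the signature above.

# Learning run: parses 'learning', 'features', 'targets' and 'output'.
train_cfg = Config('demo.yaml', learning=True)

# Prediction run: additionally requires the 'prediction' block; the
# 'targets' block is still parsed because shiftmap defaults to True.
predict_cfg = Config('demo.yaml', predicting=True)

# Resampling run: skips the 'features' block entirely; only 'targets'
# and 'output' remain mandatory.
resample_cfg = Config('demo.yaml', resampling=True, shiftmap=False)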
Example #4
    def __init__(self, yaml_file):
        with open(yaml_file, 'r') as f:
            s = yaml.safe_load(f)
        self.name = path.basename(yaml_file).rsplit(".", 1)[0]

        # TODO expose this option when fixed
        if 'patchsize' in s:
            log.info("Patchsize currently fixed at 0 -- ignoring")
        self.patchsize = 0

        self.algorithm = s['learning']['algorithm']
        self.cubist = self.algorithm == 'cubist'
        self.multicubist = self.algorithm == 'multicubist'
        self.multirandomforest = self.algorithm == 'multirandomforest'
        self.krige = self.algorithm == 'krige'
        self.algorithm_args = s['learning']['arguments']
        self.quantiles = s['prediction']['quantiles']
        self.outbands = None
        if 'outbands' in s['prediction']:
            self.outbands = s['prediction']['outbands']
        self.thumbnails = s['prediction']['thumbnails'] \
            if 'thumbnails' in s['prediction'] else None

        self.pickle = any(d['type'] == 'pickle' for d in s['features'])

        self.rawcovariates = False
        if self.pickle:
            self.pickle_load = True
            for n, d in enumerate(s['features']):
                if d['type'] == 'pickle':
                    self.pickled_covariates = \
                        path.abspath(d['files']['covariates'])
                    self.pickled_targets = d['files']['targets']
                    if 'rawcovariates' in d['files']:
                        self.rawcovariates = d['files']['rawcovariates']
                        self.rawcovariates_mask = \
                            d['files']['rawcovariates_mask']
                    if not (path.exists(d['files']['covariates'])
                            and path.exists(d['files']['targets'])):
                        self.pickle_load = False
                    if self.cubist or self.multicubist:
                        self.featurevec = \
                            path.abspath(d['files']['featurevec'])
                        if not path.exists(d['files']['featurevec']):
                            self.pickle_load = False
                    if 'plot_covariates' in d['files']:
                        self.plot_covariates = d['files']['plot_covariates']
                    else:
                        self.plot_covariates = False
            # Drop 'pickle' entries once finished so only real feature sets
            # remain (popping while iterating can skip elements).
            s['features'] = [d for d in s['features'] if d['type'] != 'pickle']
        else:
            self.pickle_load = False

        if not self.pickle_load:
            log.info('One or both pickled files were not '
                     'found. All targets will be intersected.')

        self.feature_sets = [FeatureSetConfig(k) for k in s['features']]

        if 'preprocessing' in s:
            final_transform = s['preprocessing']
            _, im, trans_g = _parse_transform_set(
                final_transform['transforms'], final_transform['imputation'])
            self.final_transform = transforms.TransformSet(im, trans_g)
        else:
            self.final_transform = None

        self.target_file = s['targets']['file']
        self.target_property = s['targets']['property']

        self.resample = None

        if 'resample' in s['targets']:
            self.resample = s['targets']['resample']

        self.mask = None
        if 'mask' in s:
            self.mask = s['mask']['file']
            self.retain = s['mask']['retain']  # mask areas that are predicted

        self.lon_lat = False
        if 'lon_lat' in s:
            self.lon_lat = True
            self.lat = s['lon_lat']['lat']
            self.lon = s['lon_lat']['lon']

        # TODO pipeline this better
        self.rank_features = False
        self.cross_validate = False
        self.parallel_validate = False
        if s.get('validation'):
            for i in s['validation']:
                if i == 'feature_rank':
                    self.rank_features = True
                if i == 'parallel':
                    self.parallel_validate = True
                if type(i) is dict and 'k-fold' in i:
                    self.cross_validate = True
                    self.folds = i['k-fold']['folds']
                    self.crossval_seed = i['k-fold']['random_seed']
                    break

        if self.rank_features and self.pickle_load:
            self.pickle_load = False
            log.info('Feature ranking does not work with '
                     'pickled files. Pickled files will not be used. '
                     'All covariates will be intersected.')

        self.output_dir = s['output']['directory']

        # create output dir if does not exist
        makedirs(self.output_dir, exist_ok=True)

        if 'optimisation' in s:
            self.optimisation = s['optimisation']
            if 'optimisation_output' in self.optimisation:
                self.optimisation_output = \
                    self.optimisation['optimisation_output']

        self.cluster_analysis = False
        if 'clustering' in s:
            self.clustering_algorithm = s['clustering']['algorithm']
            cluster_args = s['clustering']['arguments']
            self.n_classes = cluster_args['n_classes']
            self.oversample_factor = cluster_args['oversample_factor']
            if 'file' in s['clustering'] and s['clustering']['file']:
                self.semi_supervised = True
                self.class_file = s['clustering']['file']
                self.class_property = s['clustering']['property']
            else:
                self.semi_supervised = False
            if 'cluster_analysis' in s['clustering']:
                self.cluster_analysis = s['clustering']['cluster_analysis']
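For reference, the clustering branch at the end expects a parsed block shaped roughly as follows; values and paths are illustrative only.

# Hypothetical parsed-YAML fragment for the clustering branch above.
s = {
    'clustering': {
        'algorithm': 'kmeans',
        'arguments': {'n_classes': 5, 'oversample_factor': 2},
        # Optional: a truthy 'file' switches on semi-supervised mode,
        # in which case 'property' names the class attribute to read.
        'file': './labels.shp',
        'property': 'class_id',
        'cluster_analysis': True,
    }
}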