""" Author(s): Carson Schubert ([email protected]) Date Created: 04/05/2019 Utility module for managing ravenml's configuration. """ import yaml from copy import deepcopy from pathlib import Path from ravenml.utils.local_cache import RMLCache from click.exceptions import BadParameter config_cache = RMLCache() # required configuration fields CONFIG_FIELDS = sorted( ['image_bucket_name', 'dataset_bucket_name', 'model_bucket_name']) def get_config() -> dict: """Retrieves the current configuration. Returns: dict: current configuration Raises: ValueError: If a required field is missing or an invalid field is found. FileNotFoundError: If a configuration file is not found. """ config = {}
""" import pytest import os import re from pathlib import Path from click.testing import CliRunner from ravenml.cli import cli from ravenml.utils.local_cache import RMLCache ### SETUP ### runner = CliRunner() test_dir = Path(os.path.dirname(__file__)) test_data_dir = test_dir / Path('data') ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]') test_cache = RMLCache() def setup_module(): """ Sets up the module for testing. """ test_cache.path = test_dir / '.testing' def teardown_module(): """ Tears down the module after testing. """ test_cache.clean() ### TESTS ###
"""
Author(s): Carson Schubert ([email protected])
Date Created: 02/23/2019

Main CLI entry point for ravenml.
"""

import click
from colorama import init, Fore
from ravenml.train.commands import train
from ravenml.data.commands import data
from ravenml.config.commands import config
from ravenml.utils.config import get_config, update_config
from ravenml.utils.local_cache import RMLCache

# initialize colorama so ANSI color codes work cross-platform
init()

# top-level ravenml cache
cache = RMLCache()

### OPTIONS ###

# reusable flag for cache-cleaning commands defined elsewhere in this file
clean_all_opt = click.option(
    '-a', '--all', is_flag=True,
    help='Clear all cache contents, including saved ravenML configuration.')


### COMMANDS ###
@click.group(help='Welcome to ravenML!')
def cli():
    """ Top level command group for ravenml. """
    pass
""" Author(s): Carson Schubert ([email protected]) Date Created: 03/13/2019 Utility module for managing Jigsaw created datasets. """ import json import boto3 from pathlib import Path from botocore.exceptions import ClientError from ravenml.utils.local_cache import RMLCache from ravenml.utils.config import get_config from ravenml.utils.aws import list_top_level_bucket_prefixes imageset_cache = RMLCache('imagesets') # name of config field BUCKET_FIELD = 'image_bucket_name' ### PUBLIC METHODS ### def get_imageset_names() -> list: """Retrieves the names of all available imagesets in bucket pointed to by global config. Returns: list: imageset names """ config = get_config() return list_top_level_bucket_prefixes(config[BUCKET_FIELD])
def __init__(self, config: dict = None, plugin_name: str = None):
    """Builds a dataset-creation input from a user-supplied config dict.

    Prompts interactively for any required value the config omits.

    Args:
        config (dict): all config fields supplied by the user. Required.
        plugin_name (str): name of the invoking plugin. Required.

    Raises:
        click.exceptions.UsageError: if config or plugin_name is missing.
        click.exceptions.BadParameter: if imageset or plugin fields are
            missing or invalid in the config.
    """
    if config is None or plugin_name is None:
        raise click.exceptions.UsageError(
            ('You must provide the --config option '
             'on `ravenml create` when using this plugin command.'))
    self.config = config

    ## Set up Local Cache
    # currently the cache_name subdir is only created IF the plugin places files there
    self.imageset_cache = RMLCache()
    ## Set up Artifact Path
    dp = config.get('dataset_path')
    if dp is None:
        # no path given: write datasets into the cache's 'datasets' subdir
        self.imageset_cache.ensure_subpath_exists('datasets')
        self.dataset_path = Path(self.imageset_cache.path / 'datasets')
    else:
        dp = Path(os.path.expanduser(dp))
        # check if local path contains data
        if os.path.exists(dp) and os.path.isdir(dp) and len(
                os.listdir(dp)) > 0:
            # either the config pre-authorizes overwrite or the user confirms it
            if config.get('overwrite_local') or user_confirms(
                    'Local artifact storage location contains old data. Overwrite?'
            ):
                shutil.rmtree(dp)
            else:
                click.echo(Fore.RED + 'Dataset creation cancelled.')
                click.get_current_context().exit()
        # create directory, need exist_ok since we only delete
        # if directory contains files
        # TODO: protect against paths to actual files
        os.makedirs(dp, exist_ok=True)
        self.dataset_path = dp

    ## Set up Imageset
    # s3 download imagesets
    if not config.get('local'):
        imageset_list = config.get('imageset')
        imageset_options = get_imageset_names()
        # prompt for imagesets if not provided
        if imageset_list is None:
            imageset_list = user_selects('Choose imagesets:',
                                         imageset_options,
                                         selection_type="checkbox")
        else:
            # validate every requested imageset against what S3 offers
            for imageset in imageset_list:
                if imageset not in imageset_options:
                    hint = 'imageset name, no such imageset exists on S3'
                    raise click.exceptions.BadParameter(
                        imageset, param=imageset_list, param_hint=hint)
        ## Download imagesets
        self.imageset_cache.ensure_subpath_exists('imagesets')
        self.imageset_paths = []
        self.download_imagesets(imageset_list)
    # local imagesets
    else:
        imageset_paths = config.get('imageset')
        imageset_list = []
        if imageset_paths is None:
            # NOTE(review): click expects `param` to be a click.Parameter;
            # passing the config dict here likely mis-renders — confirm.
            raise click.exceptions.BadParameter(
                config,
                param=config,
                param_hint='config, no "imageset" filepaths. Config was')
        for imageset in imageset_paths:
            if not os.path.isdir(imageset):
                raise click.exceptions.BadParameter(
                    config,
                    param=config,
                    param_hint='config, invalid "imageset" path: ' +
                    imageset + ' Config was')
            # record the basename (if any) as the imageset's display name
            if os.path.basename(imageset):
                imageset_list.append(os.path.basename(imageset))
        self.imageset_paths = [
            Path(imageset_path) for imageset_path in imageset_paths
        ]

    ## Set up Basic Metadata
    # TODO: add environment description, git hash, etc
    self.metadata = config.get('metadata', {})
    # handle user defined metadata fields
    if not self.metadata.get('created_by'):
        self.metadata['created_by'] = user_input(
            'Please enter your first and last name:')
    if not self.metadata.get('comments'):
        self.metadata['comments'] = user_input(
            'Please enter descriptive comments about this training:')
    # handle automatic metadata fields
    # utcnow() is naive UTC; the "Z" suffix marks the timestamp as UTC
    self.metadata['date_started_at'] = datetime.utcnow().isoformat() + "Z"
    # local paths are recorded when no imageset names were collected
    self.metadata[
        'imagesets_used'] = imageset_list if imageset_list else self.imageset_paths

    # handle non-metadata user defined fields
    # falsy kfolds (missing/0) means no k-fold splitting
    self.kfolds = config['kfolds'] if config.get('kfolds') else 0
    # default 20% test split when not specified
    self.test_percent = config['test_percent'] if config.get(
        'test_percent') else .2

    # Initialize Directory for Dataset
    self.metadata['dataset_name'] = config['dataset_name'] if config.get(
        'dataset_name') else user_input(
            message="What would you like to name this dataset?")
    dir_name = self.dataset_path / self.metadata['dataset_name']
    if os.path.isdir(dir_name):
        # same overwrite policy as the artifact path above
        if config.get('overwrite_local') or user_confirms(
                'Local artifact storage location contains old data. Overwrite?'
        ):
            print("WARNING: Deleting existing dataset in cache")
            shutil.rmtree(dir_name)
            os.mkdir(dir_name)
        else:
            click.echo(Fore.RED + 'Dataset creation cancelled.')
            click.get_current_context().exit()
    else:
        os.mkdir(dir_name)

    ## Set up fields for plugin use
    # NOTE: plugins should overwrite the architecture field to something
    # more specific/useful since it is used to name the final uploaded model
    self.metadata[plugin_name] = {'architecture': plugin_name}
    # plugins should only ACCESS the plugin_metadata attribute and add items. They should
    # NEVER assign to the attribute as it will break the reference to the overall metadata dict
    self.plugin_metadata = self.metadata[plugin_name]

    if not config.get('plugin'):
        raise click.exceptions.BadParameter(
            config,
            param=config,
            param_hint='config, no "plugin" field. Config was')
    else:
        self.plugin_config = config.get('plugin')

    # Set up what should be done after dataset creation
    # explicit membership checks so an explicit False is honored
    self.upload = config["upload"] if 'upload' in config.keys(
    ) else user_confirms(
        message="Would you like to upload the dataset to S3?")
    self.delete_local = config[
        "delete_local"] if 'delete_local' in config.keys(
        ) else user_confirms(message="Would you like to delete your " +
                             self.metadata['dataset_name'] + " dataset?")
class CreateInput(object):
    """Represents a dataset creation input. Contains all plugin-independent
    information necessary for training. Plugins can define their own behavior
    for getting additional information.

    Variables:
        config (dict): all config fields supplied by user
        imageset_cache (RMLCache): cache that stores imagesets locally; also
            holds the default 'datasets' output dir and any plugin temp files
            (NOTE(review): an earlier docstring named a separate plugin_cache
            attribute, but this class never creates one)
        dataset_path (Path): path to where dataset should be written to
        imageset_paths (list): list of paths to imagesets being used
        metadata (dict): holds dataset metadata, currently: created_by,
            comments, dataset_name, date_started_at, imagesets_used,
            plus a per-plugin sub-dict keyed by plugin name
        plugin_metadata (dict): the per-plugin sub-dict of metadata,
            initialized with an 'architecture' field
        kfolds (int): number of folds user wants in dataset
        test_percent (float): percentage of data should be in test set
        upload (bool): whether the user wants to upload to s3 or not
        delete_local (bool): whether the user wants to delete the local
            dataset or not
    """

    def __init__(self, config: dict = None, plugin_name: str = None):
        """Builds a dataset-creation input from a user-supplied config dict.

        Prompts interactively for any required value the config omits.

        Args:
            config (dict): all config fields supplied by the user. Required.
            plugin_name (str): name of the invoking plugin. Required.

        Raises:
            click.exceptions.UsageError: if config or plugin_name is missing.
            click.exceptions.BadParameter: if imageset or plugin fields are
                missing or invalid in the config.
        """
        if config is None or plugin_name is None:
            raise click.exceptions.UsageError(
                ('You must provide the --config option '
                 'on `ravenml create` when using this plugin command.'))
        self.config = config

        ## Set up Local Cache
        # currently the cache_name subdir is only created IF the plugin places files there
        self.imageset_cache = RMLCache()
        ## Set up Artifact Path
        dp = config.get('dataset_path')
        if dp is None:
            # no path given: write datasets into the cache's 'datasets' subdir
            self.imageset_cache.ensure_subpath_exists('datasets')
            self.dataset_path = Path(self.imageset_cache.path / 'datasets')
        else:
            dp = Path(os.path.expanduser(dp))
            # check if local path contains data
            if os.path.exists(dp) and os.path.isdir(dp) and len(
                    os.listdir(dp)) > 0:
                # either the config pre-authorizes overwrite or the user confirms it
                if config.get('overwrite_local') or user_confirms(
                        'Local artifact storage location contains old data. Overwrite?'
                ):
                    shutil.rmtree(dp)
                else:
                    click.echo(Fore.RED + 'Dataset creation cancelled.')
                    click.get_current_context().exit()
            # create directory, need exist_ok since we only delete
            # if directory contains files
            # TODO: protect against paths to actual files
            os.makedirs(dp, exist_ok=True)
            self.dataset_path = dp

        ## Set up Imageset
        # s3 download imagesets
        if not config.get('local'):
            imageset_list = config.get('imageset')
            imageset_options = get_imageset_names()
            # prompt for imagesets if not provided
            if imageset_list is None:
                imageset_list = user_selects('Choose imagesets:',
                                             imageset_options,
                                             selection_type="checkbox")
            else:
                # validate every requested imageset against what S3 offers
                for imageset in imageset_list:
                    if imageset not in imageset_options:
                        hint = 'imageset name, no such imageset exists on S3'
                        raise click.exceptions.BadParameter(
                            imageset, param=imageset_list, param_hint=hint)
            ## Download imagesets
            self.imageset_cache.ensure_subpath_exists('imagesets')
            self.imageset_paths = []
            self.download_imagesets(imageset_list)
        # local imagesets
        else:
            imageset_paths = config.get('imageset')
            imageset_list = []
            if imageset_paths is None:
                # NOTE(review): click expects `param` to be a click.Parameter;
                # passing the config dict here likely mis-renders — confirm.
                raise click.exceptions.BadParameter(
                    config,
                    param=config,
                    param_hint='config, no "imageset" filepaths. Config was')
            for imageset in imageset_paths:
                if not os.path.isdir(imageset):
                    raise click.exceptions.BadParameter(
                        config,
                        param=config,
                        param_hint='config, invalid "imageset" path: ' +
                        imageset + ' Config was')
                # record the basename (if any) as the imageset's display name
                if os.path.basename(imageset):
                    imageset_list.append(os.path.basename(imageset))
            self.imageset_paths = [
                Path(imageset_path) for imageset_path in imageset_paths
            ]

        ## Set up Basic Metadata
        # TODO: add environment description, git hash, etc
        self.metadata = config.get('metadata', {})
        # handle user defined metadata fields
        if not self.metadata.get('created_by'):
            self.metadata['created_by'] = user_input(
                'Please enter your first and last name:')
        if not self.metadata.get('comments'):
            self.metadata['comments'] = user_input(
                'Please enter descriptive comments about this training:')
        # handle automatic metadata fields
        # utcnow() is naive UTC; the "Z" suffix marks the timestamp as UTC
        self.metadata['date_started_at'] = datetime.utcnow().isoformat() + "Z"
        # local paths are recorded when no imageset names were collected
        self.metadata[
            'imagesets_used'] = imageset_list if imageset_list else self.imageset_paths

        # handle non-metadata user defined fields
        # falsy kfolds (missing/0) means no k-fold splitting
        self.kfolds = config['kfolds'] if config.get('kfolds') else 0
        # default 20% test split when not specified
        self.test_percent = config['test_percent'] if config.get(
            'test_percent') else .2

        # Initialize Directory for Dataset
        self.metadata['dataset_name'] = config['dataset_name'] if config.get(
            'dataset_name') else user_input(
                message="What would you like to name this dataset?")
        dir_name = self.dataset_path / self.metadata['dataset_name']
        if os.path.isdir(dir_name):
            # same overwrite policy as the artifact path above
            if config.get('overwrite_local') or user_confirms(
                    'Local artifact storage location contains old data. Overwrite?'
            ):
                print("WARNING: Deleting existing dataset in cache")
                shutil.rmtree(dir_name)
                os.mkdir(dir_name)
            else:
                click.echo(Fore.RED + 'Dataset creation cancelled.')
                click.get_current_context().exit()
        else:
            os.mkdir(dir_name)

        ## Set up fields for plugin use
        # NOTE: plugins should overwrite the architecture field to something
        # more specific/useful since it is used to name the final uploaded model
        self.metadata[plugin_name] = {'architecture': plugin_name}
        # plugins should only ACCESS the plugin_metadata attribute and add items. They should
        # NEVER assign to the attribute as it will break the reference to the overall metadata dict
        self.plugin_metadata = self.metadata[plugin_name]

        if not config.get('plugin'):
            raise click.exceptions.BadParameter(
                config,
                param=config,
                param_hint='config, no "plugin" field. Config was')
        else:
            self.plugin_config = config.get('plugin')

        # Set up what should be done after dataset creation
        # explicit membership checks so an explicit False is honored
        self.upload = config["upload"] if 'upload' in config.keys(
        ) else user_confirms(
            message="Would you like to upload the dataset to S3?")
        self.delete_local = config[
            "delete_local"] if 'delete_local' in config.keys(
            ) else user_confirms(message="Would you like to delete your " +
                                 self.metadata['dataset_name'] + " dataset?")

    @cli_spinner_wrapper("Downloading imagesets from S3...")
    def download_imagesets(self, imageset_list):
        """Util for downloading all imagesets needed for imageset creation.

        Appends the local path of each downloaded imageset to
        self.imageset_paths.

        Args:
            imageset_list (list): list of imageset names needed
        """
        # Get image bucket name
        bucketConfig = get_config()
        image_bucket_name = bucketConfig.get('image_bucket_name')
        # Downloads each imageset and appends local path to 'self.imageset_paths'
        for imageset in imageset_list:
            imageset_path = 'imagesets/'
            self.imageset_cache.ensure_subpath_exists(imageset_path)
            download_prefix(image_bucket_name, imageset, self.imageset_cache,
                            imageset_path)
            self.imageset_paths.append(self.imageset_cache.path /
                                       'imagesets' / imageset)
"""
Author(s): Carson Schubert ([email protected])
Date Created: 03/13/2019

Utility module for managing Jigsaw created datasets.
"""

import json
import boto3
from botocore.exceptions import ClientError
from pathlib import Path
from ravenml.utils.local_cache import RMLCache
from ravenml.utils.config import get_config
from ravenml.utils.aws import list_top_level_bucket_prefixes, download_prefix
from ravenml.data.interfaces import Dataset

# local cache rooted at the 'datasets' subdirectory
dataset_cache = RMLCache('datasets')

# name of dataset bucket field inside config dict
BUCKET_FIELD = 'dataset_bucket_name'

### PUBLIC METHODS ###


def get_dataset_names() -> list:
    """Retrieves the names of all available datasets in bucket pointed to by
    global config.

    Returns:
        list: dataset names
    """
    # each top-level prefix in the configured bucket is one dataset
    return list_top_level_bucket_prefixes(get_config()[BUCKET_FIELD])