示例#1
0
"""
Author(s):      Carson Schubert ([email protected])
Date Created:   04/05/2019

Utility module for managing ravenml's configuration.
"""

import yaml
from copy import deepcopy
from pathlib import Path
from ravenml.utils.local_cache import RMLCache
from click.exceptions import BadParameter

# module-level cache handle used for reading/writing the saved config file
config_cache = RMLCache()

# required configuration fields; sorted so the list can be compared
# order-independently (NOTE(review): presumably checked against the keys of
# a loaded config inside get_config — confirm)
CONFIG_FIELDS = sorted(
    ['image_bucket_name', 'dataset_bucket_name', 'model_bucket_name'])


def get_config() -> dict:
    """Retrieves the current configuration.
    
    Returns:
        dict: current configuration

    Raises:
        ValueError: If a required field is missing or an invalid field is found.
        FileNotFoundError: If a configuration file is not found.
    """
    config = {}
示例#2
0
"""

import pytest
import os
import re
from pathlib import Path
from click.testing import CliRunner
from ravenml.cli import cli
from ravenml.utils.local_cache import RMLCache

### SETUP ###
# shared CLI runner used to invoke ravenml commands in-process
runner = CliRunner()
# directory containing this test module
test_dir = Path(os.path.dirname(__file__))
# location of static test fixture data
test_data_dir = test_dir / Path('data')
# matches ANSI color/control escape sequences so they can be stripped
# from captured CLI output before assertions
ansi_escape = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]')
# cache instance; setup_module points it at a throwaway directory
test_cache = RMLCache()


def setup_module():
    """Point the shared test cache at a throwaway directory for this module."""
    scratch_dir = test_dir / '.testing'
    test_cache.path = scratch_dir


def teardown_module():
    """Remove everything the tests wrote to the scratch cache."""
    test_cache.clean()


### TESTS ###
示例#3
0
Author(s):      Carson Schubert ([email protected])  
Date Created:   02/23/2019

Main CLI entry point for ravenml.
"""

import click
from colorama import init, Fore
from ravenml.train.commands import train
from ravenml.data.commands import data
from ravenml.config.commands import config
from ravenml.utils.config import get_config, update_config
from ravenml.utils.local_cache import RMLCache

# initialize colorama so ANSI color output renders correctly cross-platform
init()
# top-level cache handle (NOTE(review): presumably used by cache-management
# commands defined later in this file — confirm)
cache = RMLCache()

### OPTIONS ###
# reusable -a/--all flag for cache-cleaning commands; when set, saved
# ravenML configuration is wiped along with the rest of the cache contents
clean_all_opt = click.option(
    '-a',
    '--all',
    is_flag=True,
    help='Clear all cache contents, including saved ravenML configuration.')


### COMMANDS ###
@click.group(help='Welcome to ravenML!')
def cli():
    """Root click group under which all ravenml subcommands are registered."""
示例#4
0
"""
Author(s):      Carson Schubert ([email protected])  
Date Created:   03/13/2019

Utility module for managing Jigsaw created datasets.
"""

import json
import boto3
from pathlib import Path
from botocore.exceptions import ClientError
from ravenml.utils.local_cache import RMLCache
from ravenml.utils.config import get_config
from ravenml.utils.aws import list_top_level_bucket_prefixes

# local cache where downloaded imagesets are stored
imageset_cache = RMLCache('imagesets')
# name of the global-config field holding the S3 image bucket name
BUCKET_FIELD = 'image_bucket_name'


### PUBLIC METHODS ###
def get_imageset_names() -> list:
    """List every imageset available in the S3 bucket named by the global config.

    Returns:
        list: names of imagesets found under the configured image bucket
    """
    bucket = get_config()[BUCKET_FIELD]
    return list_top_level_bucket_prefixes(bucket)

示例#5
0
    def __init__(self, config: dict = None, plugin_name: str = None):
        """Gathers all plugin-independent inputs needed for dataset creation.

        Sets up the local cache, artifact path, imagesets (downloaded from S3
        or validated locally), metadata, and post-creation options, prompting
        the user interactively for any field not supplied in config.

        Args:
            config (dict): config fields supplied by the user; defaults to
                None only so a clean usage error can be raised when missing
            plugin_name (str): name of the invoking plugin

        Raises:
            click.exceptions.UsageError: if config or plugin_name is missing
            click.exceptions.BadParameter: if imageset or plugin information
                in config is missing or invalid
        """
        if config is None or plugin_name is None:
            raise click.exceptions.UsageError(
                ('You must provide the --config option '
                 'on `ravenml create` when using this plugin command.'))

        self.config = config

        ## Set up Local Cache
        # currently the cache_name subdir is only created IF the plugin places files there
        self.imageset_cache = RMLCache()

        ## Set up Artifact Path
        dp = config.get('dataset_path')
        if dp is None:
            # no user-supplied path: default to <cache>/datasets
            self.imageset_cache.ensure_subpath_exists('datasets')
            self.dataset_path = Path(self.imageset_cache.path / 'datasets')
        else:
            dp = Path(os.path.expanduser(dp))
            # check if local path contains data
            if os.path.exists(dp) and os.path.isdir(dp) and len(
                    os.listdir(dp)) > 0:
                if config.get('overwrite_local') or user_confirms(
                        'Local artifact storage location contains old data. Overwrite?'
                ):
                    shutil.rmtree(dp)
                else:
                    click.echo(Fore.RED + 'Dataset creation cancelled.')
                    click.get_current_context().exit()
            # create directory, need exist_ok since we only delete
            # if directory contains files
            # TODO: protect against paths to actual files
            os.makedirs(dp, exist_ok=True)
            self.dataset_path = dp

        ## Set up Imageset
        # s3 download imagesets
        if not config.get('local'):
            imageset_list = config.get('imageset')
            imageset_options = get_imageset_names()
            # prompt for imagesets if not provided
            if imageset_list is None:
                imageset_list = user_selects('Choose imagesets:',
                                             imageset_options,
                                             selection_type="checkbox")
            else:
                # validate every requested imageset actually exists on S3
                for imageset in imageset_list:
                    if imageset not in imageset_options:
                        hint = 'imageset name, no such imageset exists on S3'
                        raise click.exceptions.BadParameter(
                            imageset, param=imageset_list, param_hint=hint)

            ## Download imagesets
            self.imageset_cache.ensure_subpath_exists('imagesets')
            self.imageset_paths = []
            self.download_imagesets(imageset_list)
        # local imagesets
        else:
            imageset_paths = config.get('imageset')
            imageset_list = []
            if imageset_paths is None:
                raise click.exceptions.BadParameter(
                    config,
                    param=config,
                    param_hint='config, no "imageset" filepaths. Config was')
            for imageset in imageset_paths:
                if not os.path.isdir(imageset):
                    raise click.exceptions.BadParameter(
                        config,
                        param=config,
                        param_hint='config, invalid "imageset" path: ' +
                        imageset + ' Config was')
                # record just the directory name for the metadata field
                if os.path.basename(imageset):
                    imageset_list.append(os.path.basename(imageset))
            self.imageset_paths = [
                Path(imageset_path) for imageset_path in imageset_paths
            ]

        ## Set up Basic Metadata
        # TODO: add environment description, git hash, etc
        self.metadata = config.get('metadata', {})
        # handle user defined metadata fields
        if not self.metadata.get('created_by'):
            self.metadata['created_by'] = user_input(
                'Please enter your first and last name:')
        if not self.metadata.get('comments'):
            self.metadata['comments'] = user_input(
                'Please enter descriptive comments about this training:')
        # handle automatic metadata fields
        self.metadata['date_started_at'] = datetime.utcnow().isoformat() + "Z"
        # falls back to local paths when names could not be derived
        self.metadata[
            'imagesets_used'] = imageset_list if imageset_list else self.imageset_paths

        # handle non-metadata user defined fields
        self.kfolds = config['kfolds'] if config.get('kfolds') else 0
        self.test_percent = config['test_percent'] if config.get(
            'test_percent') else .2

        # Initialize Directory for Dataset
        self.metadata['dataset_name'] = config['dataset_name'] if config.get(
            'dataset_name') else user_input(
                message="What would you like to name this dataset?")
        dir_name = self.dataset_path / self.metadata['dataset_name']
        if os.path.isdir(dir_name):
            if config.get('overwrite_local') or user_confirms(
                    'Local artifact storage location contains old data. Overwrite?'
            ):
                print("WARNING: Deleting existing dataset in cache")
                shutil.rmtree(dir_name)
                os.mkdir(dir_name)
            else:
                click.echo(Fore.RED + 'Dataset creation cancelled.')
                click.get_current_context().exit()
        else:
            os.mkdir(dir_name)

        ## Set up fields for plugin use
        # NOTE: plugins should overwrite the architecture field to something
        # more specific/useful since it is used to name the final uploaded model
        self.metadata[plugin_name] = {'architecture': plugin_name}
        # plugins should only ACCESS the plugin_metadata attibute and add items. They should
        # NEVER assign to the attribute as it will break the reference to the overall metadata dict
        self.plugin_metadata = self.metadata[plugin_name]
        if not config.get('plugin'):
            raise click.exceptions.BadParameter(
                config,
                param=config,
                param_hint='config, no "plugin" field. Config was')
        else:
            self.plugin_config = config.get('plugin')

        # Set up what should be done after dataset creation
        self.upload = config["upload"] if 'upload' in config.keys(
        ) else user_confirms(
            message="Would you like to upload the dataset to S3?")
        self.delete_local = config[
            "delete_local"] if 'delete_local' in config.keys(
            ) else user_confirms(message="Would you like to delete your " +
                                 self.metadata['dataset_name'] + " dataset?")
示例#6
0
class CreateInput(object):
    """Represents a dataset creation input. Contains all plugin-independent
    information necessary for training. Plugins can define their own behavior
    for getting additional information.

    Variables:
        config (dict): all config fields supplied by user
        plugin_cache (RMLCache): cache where plugin can create temp files, and 
            where datasets are stored locally by default
        imageset_cache (RMLCache): cache that stores imagesets locally
        dataset_path (Path): path to where dataset should be written to
        imageset_paths (list): list of paths to imagesets being used
        metadata (dict): holds dataset metadata, currently: created_by, comments,
            dataset_name, date_started_at, imagesets_used, plugin_metadata
        plugin_metadata (dict): holds plugin metadata, currently: plugin_name
        kfolds (int): number of folds user wants in dataset
        test_percent (float): percentage of data should be in test set
        upload (bool): whether the user wants to upload to s3 or not
        delete_local (bool): whether the user wants to delete the local dataset
            or not
    """
    def __init__(self, config: dict = None, plugin_name: str = None):
        """Gathers all plugin-independent inputs needed for dataset creation.

        Prompts the user interactively for any field not supplied in config.

        Args:
            config (dict): config fields supplied by the user; defaults to
                None only so a clean usage error can be raised when missing
            plugin_name (str): name of the invoking plugin

        Raises:
            click.exceptions.UsageError: if config or plugin_name is missing
            click.exceptions.BadParameter: if imageset or plugin information
                in config is missing or invalid
        """
        if config is None or plugin_name is None:
            raise click.exceptions.UsageError(
                ('You must provide the --config option '
                 'on `ravenml create` when using this plugin command.'))

        self.config = config

        ## Set up Local Cache
        # currently the cache_name subdir is only created IF the plugin places files there
        self.imageset_cache = RMLCache()

        ## Set up Artifact Path
        dp = config.get('dataset_path')
        if dp is None:
            # no user-supplied path: default to <cache>/datasets
            self.imageset_cache.ensure_subpath_exists('datasets')
            self.dataset_path = Path(self.imageset_cache.path / 'datasets')
        else:
            dp = Path(os.path.expanduser(dp))
            # check if local path contains data
            if os.path.exists(dp) and os.path.isdir(dp) and len(
                    os.listdir(dp)) > 0:
                if config.get('overwrite_local') or user_confirms(
                        'Local artifact storage location contains old data. Overwrite?'
                ):
                    shutil.rmtree(dp)
                else:
                    click.echo(Fore.RED + 'Dataset creation cancelled.')
                    click.get_current_context().exit()
            # create directory, need exist_ok since we only delete
            # if directory contains files
            # TODO: protect against paths to actual files
            os.makedirs(dp, exist_ok=True)
            self.dataset_path = dp

        ## Set up Imageset
        # s3 download imagesets
        if not config.get('local'):
            imageset_list = config.get('imageset')
            imageset_options = get_imageset_names()
            # prompt for imagesets if not provided
            if imageset_list is None:
                imageset_list = user_selects('Choose imagesets:',
                                             imageset_options,
                                             selection_type="checkbox")
            else:
                # validate every requested imageset actually exists on S3
                for imageset in imageset_list:
                    if imageset not in imageset_options:
                        hint = 'imageset name, no such imageset exists on S3'
                        raise click.exceptions.BadParameter(
                            imageset, param=imageset_list, param_hint=hint)

            ## Download imagesets
            self.imageset_cache.ensure_subpath_exists('imagesets')
            self.imageset_paths = []
            self.download_imagesets(imageset_list)
        # local imagesets
        else:
            imageset_paths = config.get('imageset')
            imageset_list = []
            if imageset_paths is None:
                raise click.exceptions.BadParameter(
                    config,
                    param=config,
                    param_hint='config, no "imageset" filepaths. Config was')
            for imageset in imageset_paths:
                if not os.path.isdir(imageset):
                    raise click.exceptions.BadParameter(
                        config,
                        param=config,
                        param_hint='config, invalid "imageset" path: ' +
                        imageset + ' Config was')
                # record just the directory name for the metadata field
                if os.path.basename(imageset):
                    imageset_list.append(os.path.basename(imageset))
            self.imageset_paths = [
                Path(imageset_path) for imageset_path in imageset_paths
            ]

        ## Set up Basic Metadata
        # TODO: add environment description, git hash, etc
        self.metadata = config.get('metadata', {})
        # handle user defined metadata fields
        if not self.metadata.get('created_by'):
            self.metadata['created_by'] = user_input(
                'Please enter your first and last name:')
        if not self.metadata.get('comments'):
            self.metadata['comments'] = user_input(
                'Please enter descriptive comments about this training:')
        # handle automatic metadata fields
        self.metadata['date_started_at'] = datetime.utcnow().isoformat() + "Z"
        # falls back to local paths when names could not be derived
        self.metadata[
            'imagesets_used'] = imageset_list if imageset_list else self.imageset_paths

        # handle non-metadata user defined fields
        self.kfolds = config['kfolds'] if config.get('kfolds') else 0
        self.test_percent = config['test_percent'] if config.get(
            'test_percent') else .2

        # Initialize Directory for Dataset
        self.metadata['dataset_name'] = config['dataset_name'] if config.get(
            'dataset_name') else user_input(
                message="What would you like to name this dataset?")
        dir_name = self.dataset_path / self.metadata['dataset_name']
        if os.path.isdir(dir_name):
            if config.get('overwrite_local') or user_confirms(
                    'Local artifact storage location contains old data. Overwrite?'
            ):
                print("WARNING: Deleting existing dataset in cache")
                shutil.rmtree(dir_name)
                os.mkdir(dir_name)
            else:
                click.echo(Fore.RED + 'Dataset creation cancelled.')
                click.get_current_context().exit()
        else:
            os.mkdir(dir_name)

        ## Set up fields for plugin use
        # NOTE: plugins should overwrite the architecture field to something
        # more specific/useful since it is used to name the final uploaded model
        self.metadata[plugin_name] = {'architecture': plugin_name}
        # plugins should only ACCESS the plugin_metadata attibute and add items. They should
        # NEVER assign to the attribute as it will break the reference to the overall metadata dict
        self.plugin_metadata = self.metadata[plugin_name]
        if not config.get('plugin'):
            raise click.exceptions.BadParameter(
                config,
                param=config,
                param_hint='config, no "plugin" field. Config was')
        else:
            self.plugin_config = config.get('plugin')

        # Set up what should be done after dataset creation
        self.upload = config["upload"] if 'upload' in config.keys(
        ) else user_confirms(
            message="Would you like to upload the dataset to S3?")
        self.delete_local = config[
            "delete_local"] if 'delete_local' in config.keys(
            ) else user_confirms(message="Would you like to delete your " +
                                 self.metadata['dataset_name'] + " dataset?")

    @cli_spinner_wrapper("Downloading imagesets from S3...")
    def download_imagesets(self, imageset_list):
        """Util for downloading all imagesets needed for imageset creation.

        Appends the local path of each downloaded imageset to
        self.imageset_paths.

        Args:
            imageset_list (list): list of imageset names needed
        """
        # Get image bucket name
        bucketConfig = get_config()
        image_bucket_name = bucketConfig.get('image_bucket_name')
        # Downloads each imageset and appends local path to 'self.imageset_paths'
        # NOTE(review): download_prefix presumably mirrors the S3 prefix into
        # the cache subdir — confirm against ravenml.utils.aws
        for imageset in imageset_list:
            imageset_path = 'imagesets/'
            self.imageset_cache.ensure_subpath_exists(imageset_path)
            download_prefix(image_bucket_name, imageset, self.imageset_cache,
                            imageset_path)
            self.imageset_paths.append(self.imageset_cache.path / 'imagesets' /
                                       imageset)
示例#7
0
Author(s):      Carson Schubert ([email protected])  
Date Created:   03/13/2019

Utility module for managing Jigsaw created datasets.
"""

import json
import boto3
from botocore.exceptions import ClientError
from pathlib import Path
from ravenml.utils.local_cache import RMLCache
from ravenml.utils.config import get_config
from ravenml.utils.aws import list_top_level_bucket_prefixes, download_prefix
from ravenml.data.interfaces import Dataset

# local cache where downloaded datasets are stored
dataset_cache = RMLCache('datasets')
# name of dataset bucket field inside config dict
BUCKET_FIELD = 'dataset_bucket_name'


### PUBLIC METHODS ###
def get_dataset_names() -> list:
    """Fetch the name of every dataset stored in the configured S3 bucket.

    Returns:
        list: names of datasets found under the configured dataset bucket
    """
    bucket_name = get_config()[BUCKET_FIELD]
    return list_top_level_bucket_prefixes(bucket_name)