Example #1
def _dockerize(args):
    """

    :param args:
    :return:
    """
    dockerize.main(DisdatConfig.instance(), args)
Example #2
    def load():
        """
        Load the data contexts described at meta_dir.  Each of these is a "remote."
        Args:
            local_ctxt_dir: Directory of contexts, e.g., ~/.disdat

        Returns:
            (dict) of 'name':context pairs.

        """
        ctxt_dir = DisdatConfig.instance().get_context_dir()
        if ctxt_dir is None:
            raise Exception(
                "Unable to load context without a metadata directory argument")

        contexts = {}

        files = glob.glob(os.path.join(ctxt_dir, '*'))

        for ctxt in files:
            _logger.debug("Loading context {}...".format(ctxt))
            meta_file = os.path.join(ctxt_dir, ctxt, META_CTXT_FILE)

            if not os.path.isfile(meta_file):
                _logger.debug("No disdat {} meta ctxt data file found.".format(
                    meta_file))
            else:
                with open(meta_file, 'r') as json_file:
                    dc_dict = json.loads(json_file.readline())
                    dc = DataContext(ctxt_dir, **dc_dict)
                contexts[dc.local_ctxt] = dc

        return contexts
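A minimal usage sketch, assuming this `load` is exposed as a static method on disdat's DataContext class (the import path below is an assumption):

from disdat.data_context import DataContext  # module path assumed here

contexts = DataContext.load()  # returns {'context_name': DataContext, ...}
for name, ctxt in contexts.items():
    print(name, ctxt)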
Example #3
def get_fq_docker_repo_name(is_sagemaker, pipeline_setup_file):
    """
    Produce the fully qualified docker repo name.

    Args:
        is_sagemaker (bool): for sagemaker image
        pipeline_setup_file (str): the path to the setup.py file used to dockerize this pipeline

    Returns:
        (str): The fully qualified docker image repository name
    """
    disdat_config = DisdatConfig.instance()

    repository_prefix = None
    if disdat_config.parser.has_option('docker', 'repository_prefix'):
        repository_prefix = disdat_config.parser.get('docker',
                                                     'repository_prefix')
    if is_sagemaker:
        repository_name = common.make_sagemaker_project_repository_name(
            repository_prefix, pipeline_setup_file)
    else:
        repository_name = common.make_project_repository_name(
            repository_prefix, pipeline_setup_file)

    # Figure out the fully-qualified repository name, i.e., the name
    # including the registry.
    registry_name = disdat_config.parser.get('docker', 'registry').strip('/')
    if registry_name == '*ECR*':
        fq_repository_name = aws.ecr_get_fq_repository_name(repository_name)
    else:
        fq_repository_name = '{}/{}'.format(registry_name, repository_name)

    return fq_repository_name
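A minimal usage sketch (the setup.py path below is hypothetical; the resulting name depends on the 'docker' section of the Disdat config):

setup_file = '/home/user/my_pipeline/setup.py'  # hypothetical pipeline setup.py
fq_name = get_fq_docker_repo_name(is_sagemaker=False, pipeline_setup_file=setup_file)
print(fq_name)  # e.g. '<registry>/<prefix>/<project-repo>', or an ECR repository name when registry is '*ECR*'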
Example #4
def _apply(args):
    """

    :param args:
    :return:
    """
    apply.main(DisdatConfig.instance(), args)
Example #5
def main():
    """
    Main is the package entry point.
    """

    if getattr(sys, 'frozen', False):
        here = os.path.join(sys._MEIPASS, 'disdat')
    else:
        here = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))

    args = sys.argv[1:]

    # General options
    parser = argparse.ArgumentParser(prog='dsdt', description='DisDat (dsdt) -- distributed data science management')
    parser.add_argument(
        '--profile',
        type=str,
        default=None,
        help="An AWS credential profile to use when performing AWS operations (default is to use the \'default\' profile)",
        dest='aws_profile'
    )
    parser.add_argument("--verbose", action='store_true', help='Be verbose: Show extra debugging information')
    parser.add_argument("--version", action='version', version='Running Disdat version {}'.format(__version__))
    subparsers = parser.add_subparsers()

    ls_p = subparsers.add_parser('init')
    ls_p.set_defaults(func=lambda args: DisdatConfig.init())

    # Add disdat core subparsers
    fs.add_arg_parser(subparsers)
    add.add_arg_parser(subparsers)
    lineage.add_arg_parser(subparsers)

    # Add additional parsers if we are imported
    resolve_cli_extras(subparsers)

    args = parser.parse_args(args)

    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG

    log.enable(level=log_level)  # TODO: Add configurable verbosity

    if args.aws_profile is not None:
        os.environ['AWS_PROFILE'] = args.aws_profile

    if hasattr(args, 'func'):
        args.func(args)
    else:
        print("dsdt requires arguments, see `dsdt -h` for usage")
Example #6
def _run_local(cli, pipeline_setup_file, arglist, backend):
    """
    Run container locally or run sagemaker container locally
    Args:
        cli (bool): Whether we were called from the CLI or API
        pipeline_setup_file (str): The FQ path to the setup.py used to dockerize the pipeline.
        arglist:
        backend:

    Returns:
        output (str): Returns None if there is a failure

    """

    on_macos = False
    if platform == "darwin":
        on_macos = True

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']

    environment[common.LOCAL_EXECUTION] = 'True'

    # Todo: Local runs do not yet set resource limits, but when they do, we'll have to set this
    #environment['DISDAT_CPU_COUNT'] = vcpus

    volumes = {}
    aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                               os.path.join(os.environ['HOME'], '.aws'))
    if aws_config_dir is not None and os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_project_image_name(
                pipeline_setup_file)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'),
                      'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
                arglist = ['train']  # rewrite the arguments to just 'train'
                # On macOS, mkdtemp returns a path under /var, which is really /private/var.
                # Use the /private prefix since that directory (and not /var) is shared with Docker.
                if on_macos:
                    localdir = os.path.join('/private', tempdir[1:])
                else:
                    localdir = tempdir
                volumes[localdir] = {
                    'bind': '/opt/ml/input/config/',
                    'mode': 'rw'
                }
                _logger.info("VOLUMES: {}".format(volumes))
        else:
            # Add the actual command to the arglist (for non-sagemaker runs)
            arglist = [ENTRYPOINT_BIN] + arglist
            pipeline_image_name = common.make_project_image_name(
                pipeline_setup_file)

        _logger.debug('Running image {} with arguments {}'.format(
            pipeline_image_name, arglist))

        stdout = client.containers.run(pipeline_image_name,
                                       arglist,
                                       detach=False,
                                       environment=environment,
                                       init=True,
                                       stderr=True,
                                       volumes=volumes)
        stdout = six.ensure_str(stdout)
        if cli: print(stdout)
        return stdout
    except docker.errors.ContainerError as ce:
        _logger.error(
            "Internal error running image {}".format(pipeline_image_name))
        _logger.error("Error: {}".format(six.ensure_str(ce.stderr)))
        return six.ensure_str(ce.stderr)
    except docker.errors.ImageNotFound:
        _logger.error(
            "Unable to find the docker image {}".format(pipeline_image_name))
        return None
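A minimal invocation sketch (the setup.py path and arguments are hypothetical, and `Backend.Local` is assumed to be the plain local backend):

setup_file = '/home/user/my_pipeline/setup.py'   # hypothetical
arglist = []                                     # entrypoint arguments would go here
output = _run_local(cli=False,
                    pipeline_setup_file=setup_file,
                    arglist=arglist,
                    backend=Backend.Local)       # Backend.Local is an assumption
if output is None:
    print("container run failed")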
Example #7
def _run_aws_sagemaker(arglist, fq_repository_name, job_name):
    """
    Runs a training job on AWS SageMaker.  This uses the default machine type
    in the disdat.cfg file.

    Args:
        arglist:
        fq_repository_name (str): fully qualified repository name
        job_name:  instance job name

    Returns:
        TrainingJobArn (str)
    """

    disdat_config = DisdatConfig.instance()

    # SageMaker requires the job name to match ^[a-zA-Z0-9](-*[a-zA-Z0-9])*
    job_name = job_name.replace('_', '-')

    hyperparameter_dict = _sagemaker_hyperparameters_from_arglist(arglist)

    algorithm_specification = {
        'TrainingImage': fq_repository_name,
        'TrainingInputMode': 'File'
    }

    role_arn = disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_role_arn')

    input_channel_config = [
        {
            'ChannelName': 'disdat_sagemaker_input_blackhole',
            'DataSource': {
                'S3DataSource': {
                    'S3DataType': 'S3Prefix',
                    'S3Uri': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_input_uri'),
                    'S3DataDistributionType': 'FullyReplicated'
                }
            },
            'ContentType': 'application/javascript',
            'CompressionType': 'None',  # | 'Gzip'
            'RecordWrapperType': 'None'  # | 'RecordIO'
        },
    ]

    output_data_config = {
        'S3OutputPath': os.path.join(
            disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_s3_output_uri'),
            job_name)
    }

    resource_config = {
        'InstanceType': disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_type'),
        'InstanceCount': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_instance_count')),
        'VolumeSizeInGB': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_volume_sizeGB'))
        # 'VolumeKmsKeyId': 'string'
    }

    vpc_config = None  # e.g., {'SecurityGroupIds': [], 'Subnets': []}

    stopping_condition = {
        'MaxRuntimeInSeconds': int(disdat_config.parser.get(_MODULE_NAME, 'aws_sagemaker_max_runtime_sec'))
    }

    tags = [
        {'Key': 'user', 'Value': 'disdat'},
        {'Key': 'job', 'Value': job_name}
    ]

    if False:
        print("Disdat SageMaker configs")
        print("job name: {}".format(job_name))
        print("hparams: {}".format(hyperparameter_dict))
        print("algorithm: {}".format(algorithm_specification))
        print("Role ARN: {}".format(role_arn))
        print("Input data conf: {}".format(input_channel_config))
        print("Output data conf: {}".format(output_data_config))
        print("Resource conf: {}".format(resource_config))
        print("VPC conf: {}".format(vpc_config))
        print("Stopping condition seconds: {}".format(stopping_condition))
        print("Tags: {}".format(tags))

    client = b3.client('sagemaker', region_name=aws.profile_get_region())

    response = client.create_training_job(
        TrainingJobName=job_name,
        HyperParameters=hyperparameter_dict,
        AlgorithmSpecification=algorithm_specification,
        RoleArn=role_arn,
        InputDataConfig=input_channel_config,
        OutputDataConfig=output_data_config,
        ResourceConfig=resource_config,
        StoppingCondition=stopping_condition,
        Tags=tags)

    _logger.info(
        "Disdat SageMaker create_training_job response {}".format(response))
    return response['TrainingJobArn']
Example #8
def _run_aws_batch(arglist, fq_repository_name, job_name, pipeline_image_name,
                   aws_session_token_duration, vcpus, memory, no_submit,
                   job_role_arn):
    """
    Run job on AWS Batch.   Sends to queue configured in disdat.cfg.
    This assumes that you have already created a cluster that will run the jobs
    that have been assigned to that queue.

    Args:
        arglist:
        fq_repository_name (str): The fully qualified docker repository name
        job_name:
        pipeline_image_name:
        aws_session_token_duration:
        vcpus:
        memory:
        no_submit (bool): default False
        job_role_arn (str): Can be None

    Returns:

    """
    def check_role_arn(job_dict, jra):
        """ Check to see if the job desc dictionary contains the same job_role_arn (jra)
        """

        if jra is None:
            if 'jobRoleArn' not in job_dict['containerProperties']:
                return True
        else:
            if 'jobRoleArn' in job_dict['containerProperties']:
                if job_dict['containerProperties']['jobRoleArn'] == jra:
                    return True
        return False

    disdat_config = DisdatConfig.instance()

    # Get the parameter values required to kick off an AWS Batch job.
    # Every batch job must:
    # 1. Have a name
    # 2. Have a job definition that declares which ECR-hosted Docker
    #    image to use.
    # 3. Have a queue that feeds jobs into a compute cluster.
    # 4. The command to execute inside the Docker image; the command
    #    args are more-or-less the same as the ones used to execute
    #    locally using 'dsdt run'

    # Create a Job Definition and upload it.
    # We create per-user job definitions so multiple users do not clobber each other.
    # In addition, we never re-use a job definition, since the user may update
    # the vcpu or memory requirements and those are stuck in the job definition

    job_definition_name = aws.batch_get_job_definition_name(
        pipeline_image_name)

    if disdat_config.parser.has_option(_MODULE_NAME,
                                       'aws_batch_job_definition'):
        job_definition_name = disdat_config.parser.get(
            _MODULE_NAME, 'aws_batch_job_definition')

    # TODO: Look through all of history to find one that matches?
    # TODO: Delete old jobs here or let user do it?
    job_definition_obj = aws.batch_get_latest_job_definition(
        job_definition_name)

    if (job_definition_obj is not None
            and job_definition_obj['containerProperties']['image']
            == fq_repository_name
            and job_definition_obj['containerProperties']['vcpus'] == vcpus
            and job_definition_obj['containerProperties']['memory'] == memory
            and check_role_arn(job_definition_obj, job_role_arn)):

        job_definition_fqn = aws.batch_extract_job_definition_fqn(
            job_definition_obj)

        _logger.info("Re-using prior AWS Batch run job definition : {}".format(
            job_definition_obj))

    else:
        """ Whether None or doesn't match, make a new one """

        job_definition_obj = aws.batch_register_job_definition(
            job_definition_name,
            fq_repository_name,
            vcpus=vcpus,
            memory=memory,
            job_role_arn=job_role_arn)

        job_definition_fqn = aws.batch_get_job_definition(job_definition_name)

        _logger.info(
            "New AWS Batch run job definition {}".format(job_definition_fqn))

    if no_submit:
        # Return the job description object
        return job_definition_obj

    job_queue = disdat_config.parser.get(_MODULE_NAME, 'aws_batch_queue')

    container_overrides = {'command': arglist}

    # Through the magic of boto3_session_cache, the client in our script
    # here can get at AWS profiles and temporary AWS tokens created in
    # part from MFA tokens generated through the user's shells; we don't
    # have to write special code of our own to deal with authenticating
    # with AWS.
    client = b3.client('batch', region_name=aws.profile_get_region())
    # A bigger problem might be that the IAM role executing the job on
    # a batch EC2 instance might not have access to the S3 remote. To
    # get around this, allow the user to create some temporary AWS
    # credentials.

    if aws_session_token_duration > 0 and job_role_arn is None:
        sts_client = b3.client('sts')
        try:
            token = sts_client.get_session_token(
                DurationSeconds=aws_session_token_duration)
            credentials = token['Credentials']
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials['AccessKeyId']},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials['SecretAccessKey']},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials['SessionToken']}
            ]
        except Exception as e:
            _logger.debug(
                "Unable to generate an STS token; falling back to the user's default credentials...")
            credentials = b3.session.Session().get_credentials()
            container_overrides['environment'] = [
                {'name': 'AWS_ACCESS_KEY_ID', 'value': credentials.access_key},
                {'name': 'AWS_SECRET_ACCESS_KEY', 'value': credentials.secret_key},
                {'name': 'AWS_SESSION_TOKEN', 'value': credentials.token}
            ]

    # 'environment' may not have been set above (e.g., when a job_role_arn is provided),
    # so create the list if needed before appending.
    container_overrides.setdefault('environment', []).append(
        {'name': 'DISDAT_CPU_COUNT', 'value': str(vcpus)})

    job = client.submit_job(jobName=job_name,
                            jobDefinition=job_definition_fqn,
                            jobQueue=job_queue,
                            containerOverrides=container_overrides)

    status = job['ResponseMetadata']['HTTPStatusCode']
    if status == 200:
        _logger.info(
            'Job {} (ID {}) with definition {} submitted to AWS Batch queue {}'
            .format(job['jobName'], job['jobId'], job_definition_fqn,
                    job_queue))
        return job
    else:
        _logger.error('Job submission failed: HTTP Status {}'.format(status))
        return None
Example #9
def _run_local(cli, arglist, pipeline_class_name, backend):
    """
    Run container locally or run sagemaker container locally
    Args:
        cli (bool): Whether we were called from the CLI or API
        arglist:
        pipeline_class_name:
        backend:

    Returns:
        output (str): Returns None if there is a failure

    """

    on_macos = False
    if platform == "darwin":
        on_macos = True

    client = docker.from_env()

    environment = {}
    if 'AWS_PROFILE' in os.environ:
        environment['AWS_PROFILE'] = os.environ['AWS_PROFILE']

    environment[common.LOCAL_EXECUTION] = 'True'

    volumes = {}
    aws_config_dir = os.getenv('AWS_CONFIG_DIR',
                               os.path.join(os.environ['HOME'], '.aws'))
    if aws_config_dir is not None and os.path.exists(aws_config_dir):
        volumes[aws_config_dir] = {'bind': '/root/.aws', 'mode': 'rw'}

    local_disdat_meta_dir = DisdatConfig.instance().get_meta_dir()
    volumes[local_disdat_meta_dir] = {'bind': '/root/.disdat', 'mode': 'rw'}

    try:
        if backend == Backend.LocalSageMaker:
            pipeline_image_name = common.make_sagemaker_pipeline_image_name(
                pipeline_class_name)
            tempdir = tempfile.mkdtemp()
            with open(os.path.join(tempdir, 'hyperparameters.json'),
                      'w') as of:
                json.dump(_sagemaker_hyperparameters_from_arglist(arglist), of)
                arglist = ['train']  # rewrite the arguments to just 'train'
                # On macOS, mkdtemp returns a path under /var, which is really /private/var.
                # Use the /private prefix since that directory (and not /var) is shared with Docker.
                if on_macos:
                    localdir = os.path.join('/private', tempdir[1:])
                else:
                    localdir = tempdir
                volumes[localdir] = {
                    'bind': '/opt/ml/input/config/',
                    'mode': 'rw'
                }
                _logger.info("VOLUMES: {}".format(volumes))
        else:
            pipeline_image_name = common.make_pipeline_image_name(
                pipeline_class_name)

        _logger.debug('Running image {} with arguments {}'.format(
            pipeline_image_name, arglist))

        stdout = client.containers.run(pipeline_image_name,
                                       arglist,
                                       detach=False,
                                       environment=environment,
                                       init=True,
                                       stderr=True,
                                       volumes=volumes)
        if cli: print(stdout)
        return stdout
    except docker.errors.ImageNotFound:
        _logger.error(
            "Unable to find the docker image {}".format(pipeline_image_name))
        return None
Example #10
def main():
    """
    Main as a function for testing convenience and as a package entry point.

    :return: (shape of input df, shape of pushed df)
    """

    if getattr(sys, 'frozen', False):
        here = sys._MEIPASS
    else:
        here = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..')

    with open(os.path.join(here, 'VERSION')) as version_file:
        __version__ = version_file.read().strip()

    args = sys.argv[1:]

    # General options
    parser = argparse.ArgumentParser(
        prog='dsdt',
        description='DisDat (dsdt) -- distributed data science management')
    parser.add_argument(
        '--profile',
        type=str,
        default=None,
        help=
        "An AWS credential profile to use when performing AWS operations (default is to use the \'default\' profile)",
        dest='aws_profile')
    parser.add_argument("--verbose",
                        action='store_true',
                        help='Be verbose: Show extra debugging information')
    parser.add_argument(
        "--version",
        action='version',
        version='Running Disdat version {}'.format(__version__))
    subparsers = parser.add_subparsers()

    ls_p = subparsers.add_parser('init')
    ls_p.set_defaults(func=lambda args: DisdatConfig.init())

    # autodock
    dockerize_p = subparsers.add_parser(
        'dockerize', description="Dockerize a particular transform.")
    dockerize_p.add_argument(
        '--config-dir',
        type=str,
        default=None,
        help=
        "A directory containing configuration files for the operating system within the Docker image",
    )
    dockerize_p.add_argument(
        '--os-type',
        type=str,
        default=None,
        help='The base operating system type for the Docker image')
    dockerize_p.add_argument(
        '--os-version',
        type=str,
        default=None,
        help='The base operating system version for the Docker image')
    dockerize_p.add_argument(
        '--push',
        action='store_true',
        help=
        "Push the image to a remote Docker registry (default is to not push; must set 'docker_registry' in Disdat config)",
    )
    dockerize_p.add_argument(
        '--no-build',
        action='store_false',
        help=
        'Do not build an image (only copy files into the Docker build context)',
        dest='build',
    )
    dockerize_p.add_argument(
        "pipe_root",
        type=str,
        help=
        "Root of the Python source tree containing the user-defined transform; must have a setuptools-style setup.py file"
    )
    dockerize_p.add_argument(
        "pipe_cls",
        type=str,
        help="User-defined transform, e.g., module.PipeClass")
    dockerize_p.set_defaults(func=lambda args: _dockerize(args))

    # run
    run_p = subparsers.add_parser(
        'run', description="Run containerized version of transform.")
    run_p.add_argument('--backend',
                       default=run.Backend.default(),
                       type=str,
                       choices=run.Backend.options(),
                       help='An optional batch execution back-end to use')
    run_p.add_argument("--force",
                       action='store_true',
                       help="If there are dependencies, force re-computation.")
    run_p.add_argument(
        "--no-push-input",
        action='store_false',
        help=
        "Do not push the current committed input bundle before execution (default is to push)",
        dest='push_input_bundle')
    run_p.add_argument(
        '-it',
        '--input-tag',
        nargs=1,
        type=str,
        action='append',
        help="Input bundle tags: '-it authoritative:True -it version:0.7.1'")
    run_p.add_argument(
        '-ot',
        '--output-tag',
        nargs=1,
        type=str,
        action='append',
        help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'")
    run_p.add_argument(
        "input_bundle",
        type=str,
        help="Name of source data bundle.  '-' means no input bundle.")
    run_p.add_argument(
        "output_bundle",
        type=str,
        help="Name of destination bundle.  '-' means default output bundle.")
    run_p.add_argument("pipe_cls",
                       type=str,
                       help="User-defined transform, e.g., module.PipeClass")
    run_p.add_argument(
        "pipeline_args",
        type=str,
        nargs=argparse.REMAINDER,
        help="Optional set of parameters for this pipe '--parameter value'")
    run_p.set_defaults(func=lambda args: _run(args))

    # apply
    apply_p = subparsers.add_parser(
        'apply',
        description=
        "Apply a transform to an input bundle to produce an output bundle.")
    apply_p.add_argument(
        '-it',
        '--input-tag',
        nargs=1,
        type=str,
        action='append',
        help="Input bundle tags: '-it authoritative:True -it version:0.7.1'")
    apply_p.add_argument(
        '-ot',
        '--output-tag',
        nargs=1,
        type=str,
        action='append',
        help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'")
    apply_p.add_argument(
        "input_bundle",
        type=str,
        help="Name of source data bundle.  '-' means no input bundle.")
    apply_p.add_argument(
        "output_bundle",
        type=str,
        help="Name of destination bundle.  '-' means default output bundle.")
    apply_p.add_argument("pipe_cls",
                         type=str,
                         help="User-defined transform, e.g., module.PipeClass")
    apply_p.add_argument("--local",
                         action='store_true',
                         help="Run the class locally (even if dockered)")
    apply_p.add_argument(
        "--force",
        action='store_true',
        help="If there are dependencies, force re-computation.")
    apply_p.add_argument(
        "params",
        type=str,
        nargs=argparse.REMAINDER,
        help="Optional set of parameters for this pipe '--parameter value'")
    apply_p.set_defaults(func=lambda args: _apply(args))

    # File system operations
    init_fs_cl(subparsers)

    args = parser.parse_args(args)

    log_level = logging.WARN
    if args.verbose:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    if args.aws_profile is not None:
        os.environ['AWS_PROFILE'] = args.aws_profile

    args.func(args)
Example #11
def main():
    """
    Main as a function for testing convenience and as a package entry point.

    :return: (shape of input df, shape of pushed df)
    """

    if getattr(sys, 'frozen', False):
        here = os.path.join(sys._MEIPASS, 'disdat')
    else:
        here = os.path.abspath(os.path.dirname(__file__))

    with open(os.path.join(here, 'VERSION')) as version_file:
        __version__ = version_file.read().strip()

    args = sys.argv[1:]

    # General options
    parser = argparse.ArgumentParser(prog='dsdt', description='DisDat (dsdt) -- distributed data science management')
    parser.add_argument(
        '--profile',
        type=str,
        default=None,
        help="An AWS credential profile to use when performing AWS operations (default is to use the \'default\' profile)",
        dest='aws_profile'
    )
    parser.add_argument("--verbose", action='store_true', help='Be verbose: Show extra debugging information')
    parser.add_argument("--version", action='version', version='Running Disdat version {}'.format(__version__))
    subparsers = parser.add_subparsers()

    ls_p = subparsers.add_parser('init')
    ls_p.set_defaults(func=lambda args: DisdatConfig.init())

    # dockerize
    subparsers = dockerize.add_arg_parser(subparsers)

    # run
    subparsers = run.add_arg_parser(subparsers)

    # apply
    apply_p = subparsers.add_parser('apply', description="Apply a transform to an input bundle to produce an output bundle.")
    apply_p.add_argument('-cs', '--central-scheduler', action='store_true', default=False, help="Use a central Luigi scheduler (defaults to local scheduler)")
    apply_p.add_argument('-w', '--workers', type=int, default=1, help="Number of Luigi workers on this node")
    apply_p.add_argument('-it', '--input-tag', nargs=1, type=str, action='append',
                         help="Input bundle tags: '-it authoritative:True -it version:0.7.1'")
    apply_p.add_argument('-ot', '--output-tag', nargs=1, type=str, action='append',
                         help="Output bundle tags: '-ot authoritative:True -ot version:0.7.1'")
    apply_p.add_argument('-o', '--output-bundle', type=str, default='-',
                         help="Name output bundle: '-o my.output.bundle'.  Default name is '<TaskName>_<param_hash>'")
    apply_p.add_argument('-f', '--force', action='store_true', help="Force re-computation of only this task.")
    apply_p.add_argument('--force-all', action='store_true', help="Force re-computation of ALL upstream tasks.")
    apply_p.add_argument('--incremental-push', action='store_true', help="Commit and push each task's bundle as it is produced to the remote.")
    apply_p.add_argument('--incremental-pull', action='store_true', help="Localize bundles as they are needed by downstream tasks from the remote.")
    apply_p.add_argument('pipe_cls', type=load_class, help="User-defined transform, e.g., 'module.PipeClass'")
    apply_p.add_argument('params', type=str,  nargs=argparse.REMAINDER,
                         help="Optional set of parameters for this pipe '--parameter value'")
    apply_p.set_defaults(func=lambda args: apply.cli_apply(args))

    # File system operations
    init_fs_cl(subparsers)

    # add
    init_add_cl(subparsers)

    # lineage
    init_lineage_cl(subparsers)

    args = parser.parse_args(args)

    log_level = logging.INFO
    if args.verbose:
        log_level = logging.DEBUG

    log.enable(level=log_level)  # TODO: Add configurable verbosity

    if args.aws_profile is not None:
        os.environ['AWS_PROFILE'] = args.aws_profile

    args.func(args)