示例#1
0
    def download_file(self, context, target_file):
        '''Download ``target_file`` from the configured bucket into ``self.target_folder``.

        The S3 key is ``self.key + '/' + target_file``. If ``self.skip_if_present`` is
        truthy and ``safe_isfile`` reports that the destination already exists, nothing
        is downloaded and only an info message is logged.

        Args:
            context: Object providing ``log`` (``info`` / ``debug``) and ``resources.s3``,
                on which ``head_object`` and ``download_file`` are called.
            target_file (str): File name, relative to both ``self.key`` and
                ``self.target_folder``.

        Returns:
            str: ``self.target_folder`` joined with ``target_file``.
        '''
        check.str_param(target_file, 'target_file')

        local_path = os.path.join(self.target_folder, target_file)

        if self.skip_if_present and safe_isfile(local_path):
            skip_message = 'Skipping download, file already present at {target_path}'.format(
                target_path=local_path)
            context.log.info(skip_message)
            return local_path

        object_key = self.key + '/' + target_file

        # Only create a parent directory when the destination actually has one.
        parent_dir = os.path.dirname(local_path)
        if parent_dir:
            mkdir_p(parent_dir)

        start_message = 'Starting download of {bucket}/{key} to {target_path}'.format(
            bucket=self.bucket, key=object_key, target_path=local_path)
        context.log.info(start_message)

        s3_client = context.resources.s3
        # The object size from the HEAD response is handed to S3Logger, which is then
        # passed to the download as its Callback.
        head = s3_client.head_object(Bucket=self.bucket, Key=object_key)
        progress = S3Logger(context.log.debug, self.bucket, object_key,
                            local_path, int(head['ContentLength']))
        s3_client.download_file(Bucket=self.bucket,
                                Key=object_key,
                                Filename=local_path,
                                Callback=progress)
        return local_path
示例#2
0
def _download_from_s3_to_file(session, context, bucket, key, target_folder, skip_if_present):
    '''Download ``bucket``/``key`` into ``target_folder`` and return the local path.

    Args:
        session: Object on which ``head_object`` and ``download_file`` are called
            with boto3-style keyword arguments.
        context: Used only for logging (``context.log.info`` / ``context.log.debug``).
        bucket: Bucket name passed through to ``session``.
        key: Object key; the part after the last ``/`` becomes the local file name.
        target_folder: Directory that receives the file; created if missing.
        skip_if_present: When truthy, an existing local file short-circuits the download.

    Returns:
        The path of the local file inside ``target_folder``.
    '''
    # TODO: remove context argument once we support resource logging

    # The local file name is whatever follows the last '/' in the key.
    file_name = key.split('/')[-1]
    target_file = os.path.join(target_folder, file_name)

    if skip_if_present and safe_isfile(target_file):
        skip_message = 'Skipping download, file already present at {target_file}'.format(
            target_file=target_file
        )
        context.log.info(skip_message)
        return target_file

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)

    start_message = 'Starting download of {bucket}/{key} to {target_file}'.format(
        bucket=bucket, key=key, target_file=target_file
    )
    context.log.info(start_message)

    # The object size from the HEAD response is handed to S3Logger, which is then
    # passed to the download as its Callback.
    head = session.head_object(Bucket=bucket, Key=key)
    progress = S3Logger(context.log.debug, bucket, key, target_file, int(head['ContentLength']))
    session.download_file(Bucket=bucket, Key=key, Filename=target_file, Callback=progress)
    return target_file
示例#3
0
def execute_create_notebook(notebook, force_overwrite, **kwargs):
    '''Write a dagstermill notebook scaffold under the current working directory.

    Args:
        notebook: Notebook name or relative path; ``.ipynb`` is appended when the
            name does not already end with it.
        force_overwrite: When falsy and the target file exists, the user is asked
            to confirm (declining aborts via ``click.confirm(abort=True)``).
        **kwargs: Forwarded as a dict to ``get_register_repo_info``.

    Raises:
        click.BadOptionUsage: If ``notebook`` contains characters other than
            alphanumerics, ``-``, ``_``, ``\\`` and ``/``.
    '''
    name_is_valid = re.match(r'^[a-zA-Z0-9\-_\\/]+$', notebook)
    if not name_is_valid:
        error_message = (
            'Notebook name {name} is not valid, '
            'cannot contain anything except alphanumeric characters, '
            '-, _, \\ and / for path manipulation'
        ).format(name=notebook)
        raise click.BadOptionUsage(notebook, error_message)

    if notebook.endswith('.ipynb'):
        file_name = notebook
    else:
        file_name = notebook + ".ipynb"
    notebook_path = os.path.join(os.getcwd(), file_name)

    parent_dir = os.path.dirname(notebook_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if not force_overwrite and safe_isfile(notebook_path):
        prompt = (
            'Warning, {notebook_path} already exists and continuing '
            'will overwrite the existing notebook. '
            'Are you sure you want to continue?'
        ).format(notebook_path=notebook_path)
        click.confirm(prompt, abort=True)

    register_repo_info = get_register_repo_info(kwargs)

    # The scaffold is generated only after the file has been opened for writing,
    # and the confirmation is echoed before the file is closed.
    with open(notebook_path, 'w') as notebook_file:
        notebook_file.write(get_notebook_scaffolding(register_repo_info))
        created_message = "Created new dagstermill notebook at {path}".format(
            path=notebook_path)
        click.echo(created_message)
示例#4
0
def download_from_s3(context):
    '''Download every object listed in the solid config from S3.

    Args:
        context: Execution context. ``context.solid_config`` is iterated as a sequence
            of dicts with the keys ``bucket``, ``key`` and ``skip_if_present`` plus an
            optional ``target_path``; ``context.resources.s3`` must expose a
            ``download_file(bucket, key, target_path)`` method (e.g. a boto3 S3 client).

    Returns:
        list:
            The local path of each configured object, in config order. Entries that
            were skipped because the file already existed are included as well.
    '''
    results = []
    for file_ in context.solid_config:
        bucket = file_['bucket']
        key = file_['key']

        # A missing or empty target_path falls back to the key itself.
        target_path = file_.get('target_path') or key

        # Only reachable when the key itself is None; kept for backward compatibility.
        if target_path is None:
            target_path = context.resources.tempfile.tempfile().name

        if file_['skip_if_present'] and safe_isfile(target_path):
            context.log.info(
                'Skipping download, file already present at {target_path}'.
                format(target_path=target_path))
        else:
            # Only create a parent directory when the destination actually has one.
            parent_dir = os.path.dirname(target_path)
            if parent_dir:
                mkdir_p(parent_dir)

            context.resources.s3.download_file(bucket, key, target_path)
        results.append(target_path)
    return results
示例#5
0
def download_from_s3(context):
    '''Download the single S3 object described by the solid config.

    Reads ``bucket``, ``key``, ``target_folder`` and ``skip_if_present`` from
    ``context.solid_config`` and stores the object in ``target_folder`` under the
    part of the key that follows the last ``/``.

    Returns:
        The local file path. It is returned whether the download ran or was skipped.
    '''
    config = context.solid_config
    bucket = config.get('bucket')
    key = config.get('key')
    target_folder = config.get('target_folder')
    skip_if_present = config.get('skip_if_present')

    # file name is S3 key path suffix after last /
    file_name = key.split('/')[-1]
    target_file = os.path.join(target_folder, file_name)

    if skip_if_present and safe_isfile(target_file):
        skip_message = 'Skipping download, file already present at {target_file}'.format(
            target_file=target_file)
        context.log.info(skip_message)
        return target_file

    if not os.path.exists(target_folder):
        mkdir_p(target_folder)

    start_message = 'Starting download of {bucket}/{key} to {target_file}'.format(
        bucket=bucket, key=key, target_file=target_file)
    context.log.info(start_message)

    client = boto3.client('s3')

    # The object size from the HEAD response is handed to S3Logger, which is then
    # passed to the download as its Callback.
    head = client.head_object(Bucket=bucket, Key=key)
    progress = S3Logger(context.log.debug, bucket, key, target_file,
                        int(head['ContentLength']))
    client.download_file(Bucket=bucket,
                         Key=key,
                         Filename=target_file,
                         Callback=progress)
    return target_file
示例#6
0
def file_exists_at_path_type_check(value):
    '''Validate that ``value`` is a string for which ``safe_isfile`` is true.

    Raises:
        Failure: If ``value`` is not a string, or if ``safe_isfile(value)`` is false.
    '''
    if not isinstance(value, six.string_types):
        not_a_string = 'FileExistsAtPath must be a string in memory. Got {value}'.format(
            value=repr(value))
        raise Failure(not_a_string)

    if safe_isfile(value):
        return

    not_on_disk = (
        'FileExistsAtPath must be a path that points to a file that '
        'exists. "{value}" does not exist on disk'
    ).format(value=value)
    raise Failure(not_on_disk)
示例#7
0
def unzip_file(
    context,
    archive_paths,
    archive_members,
    # destination_dir=None
):
    '''Extract zip archives into the directory each archive lives in.

    Args:
        context: Provides ``solid_config['skip_if_present']`` and ``log.info``.
        archive_paths: Sequence of paths to zip archives.
        archive_members: Optional sequence parallel to ``archive_paths`` naming the
            single member to extract from each archive. When it is falsy, or an
            entry is ``None``, the whole archive is extracted.

    Returns:
        list: One entry per archive, in order: the extracted member's path, or the
        destination directory when the whole archive was handled.

    With ``skip_if_present`` set, a single member is skipped when its target path
    already exists as a file or directory. A whole archive is skipped when every
    member already exists under the destination directory.
    '''
    # FIXME: the destination directory and archive member should eventually come
    # from config, e.g. info.config['archive_path'] / info.config['archive_member'].
    results = []
    for (i, archive_path) in enumerate(archive_paths):
        destination_dir = os.path.dirname(archive_path)
        archive_member = archive_members[i] if archive_members else None

        with zipfile.ZipFile(archive_path, 'r') as zip_ref:
            skip_if_present = context.solid_config['skip_if_present']

            if archive_member is not None:
                target_path = os.path.join(destination_dir, archive_member)
                is_file = safe_isfile(target_path)
                is_dir = os.path.isdir(target_path)
                if not (skip_if_present and (is_file or is_dir)):
                    zip_ref.extract(archive_member, destination_dir)
                else:
                    if is_file:
                        context.log.info(
                            'Skipping unarchive of {archive_member} from {archive_path}, '
                            'file already present at {target_path}'.format(
                                archive_member=archive_member,
                                archive_path=archive_path,
                                target_path=target_path,
                            ))
                    if is_dir:
                        context.log.info(
                            'Skipping unarchive of {archive_member} from {archive_path}, '
                            'directory already present at {target_path}'.
                            format(
                                archive_member=archive_member,
                                archive_path=archive_path,
                                target_path=target_path,
                            ))
            else:
                # This branch used to read `is_dir` and `target_path` without ever
                # assigning them (NameError, or stale values from a previous
                # iteration). The whole-archive target is the destination directory.
                target_path = destination_dir

                # The destination is the archive's own directory, so "directory
                # exists" would be almost always true. Skip only when every member
                # is already on disk.
                already_extracted = all(
                    os.path.exists(os.path.join(destination_dir, member_name))
                    for member_name in zip_ref.namelist())

                if not (skip_if_present and already_extracted):
                    zip_ref.extractall(destination_dir)
                else:
                    context.log.info(
                        'Skipping unarchive of {archive_path}, all members already '
                        'present at {target_path}'.format(archive_path=archive_path,
                                                          target_path=target_path))
        results.append(target_path)
    return results
示例#8
0
文件: cli.py 项目: saket1994/dagster
def execute_create_notebook(notebook, solid_name, force_overwrite, **kwargs):
    '''Write a dagstermill notebook scaffold under the current working directory.

    Args:
        notebook: Notebook name or relative path; ``.ipynb`` is appended when the
            name does not already end with it.
        solid_name: Name used in the generated ``dm.declare_as_solid`` statement.
            When falsy, the notebook file's base name up to its first ``.`` is used.
        force_overwrite: When falsy and the target file exists, the user is asked
            to confirm (declining aborts via ``click.confirm(abort=True)``).
        **kwargs: Forwarded as a dict to ``load_target_info_from_cli_args``.

    Raises:
        click.BadOptionUsage: If ``notebook`` contains characters other than
            alphanumerics, ``-``, ``_``, ``\\`` and ``/``.
        click.UsageError: If ``get_module_target_function`` returns a falsy value
            for the loaded repository target.
    '''
    name_is_valid = re.match(r'^[a-zA-Z0-9\-_\\/]+$', notebook)
    if not name_is_valid:
        error_message = (
            'Notebook name {name} is not valid, '
            'cannot contain anything except alphanumeric characters, '
            '-, _, \\ and / for path manipulation'
        ).format(name=notebook)
        raise click.BadOptionUsage(notebook, error_message)

    if notebook.endswith('.ipynb'):
        file_name = notebook
    else:
        file_name = notebook + ".ipynb"
    notebook_path = os.path.join(os.getcwd(), file_name)

    parent_dir = os.path.dirname(notebook_path)
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)

    if not force_overwrite and safe_isfile(notebook_path):
        prompt = (
            'Warning, {notebook_path} already exists and continuing '
            'will overwrite the existing notebook. '
            'Are you sure you want to continue?'
        ).format(notebook_path=notebook_path)
        click.confirm(prompt, abort=True)

    if not solid_name:
        solid_name = os.path.basename(notebook_path).split(".")[0]

    repository_target_info = load_target_info_from_cli_args(kwargs)
    module_target_info = get_module_target_function(repository_target_info)

    if not module_target_info:
        raise click.UsageError(
            "Cannot instantiate notebook with repository definition given by a function from a file"
        )

    fn_name = module_target_info.fn_name
    RegisterRepoInfo = namedtuple(
        'RegisterRepoInfo', 'import_statement declaration_statement')
    register_repo_info = RegisterRepoInfo(
        import_statement="from {module} import {fn_name}".format(
            module=module_target_info.module_name, fn_name=fn_name),
        declaration_statement="dm.declare_as_solid({fn_name}(), '{solid_name}')".format(
            fn_name=fn_name, solid_name=solid_name),
    )

    # The scaffold is generated only after the file has been opened for writing,
    # and the confirmation is echoed before the file is closed.
    with open(notebook_path, 'w') as notebook_file:
        notebook_file.write(get_notebook_scaffolding(register_repo_info))
        created_message = "Created new dagstermill notebook at {path}".format(
            path=notebook_path)
        click.echo(created_message)
示例#9
0
def gunzipper(_, gzip_file):
    """Decompress ``gzip_file`` to ``raw/2019/01/01/data.json`` beside it.

    For ``/path/to/foo.gz`` the output is ``/path/to/raw/2019/01/01/data.json``.
    If ``safe_isfile`` reports that the output already exists, nothing is written.

    Returns:
        A one-element list holding the directory that contains ``gzip_file``.
    """
    # TODO: take date as an input

    base_dir = os.path.dirname(gzip_file)
    target_dir = os.path.join(base_dir, "raw/2019/01/01")
    target_file = os.path.join(target_dir, "data.json")

    if safe_isfile(target_file):
        return [base_dir]

    mkdir_p(target_dir)
    with gzip.open(gzip_file, "rb") as compressed:
        with open(target_file, "wb") as expanded:
            shutil.copyfileobj(compressed, expanded)

    return [base_dir]
示例#10
0
def gunzipper(_, gzip_file):
    '''Decompress ``gzip_file`` to ``raw/2019/01/01/data.json`` beside it.

    For ``/path/to/foo.gz`` the output is ``/path/to/raw/2019/01/01/data.json``.
    If ``safe_isfile`` reports that the output already exists, nothing is written.

    Returns:
        A one-element list holding the directory that contains ``gzip_file``.
    '''
    # TODO: take date as an input

    base_dir = os.path.dirname(gzip_file)
    target_dir = os.path.join(base_dir, 'raw/2019/01/01')
    target_file = os.path.join(target_dir, 'data.json')

    if safe_isfile(target_file):
        return [base_dir]

    mkdir_p(target_dir)
    with gzip.open(gzip_file, 'rb') as compressed:
        with open(target_file, 'wb') as expanded:
            shutil.copyfileobj(compressed, expanded)

    return [base_dir]
示例#11
0
def file_exists_at_path_type_check(_, value):
    '''Check that ``value`` is a string for which ``safe_isfile`` is true.

    Args:
        _: Unused first argument.
        value: The candidate path.

    Returns:
        ``True`` when the check passes, otherwise a ``TypeCheck`` with
        ``success=False`` and a description of what went wrong.
    '''
    if not isinstance(value, six.string_types):
        not_a_string = 'FileExistsAtPath must be a string in memory. Got {value}'.format(
            value=repr(value)
        )
        return TypeCheck(success=False, description=not_a_string)

    if safe_isfile(value):
        return True

    not_on_disk = (
        'FileExistsAtPath must be a path that points to a file that '
        'exists. "{value}" does not exist on disk'
    ).format(value=value)
    return TypeCheck(success=False, description=not_on_disk)
示例#12
0
def execute_create_notebook(notebook, force_overwrite, kernel):
    '''Write a dagstermill notebook scaffold under the current working directory.

    Args:
        notebook: Notebook name or relative path; ``.ipynb`` is appended when the
            name does not already end with it.
        force_overwrite: When falsy and the target file exists, the user is asked
            to confirm (declining aborts via ``click.confirm(abort=True)``).
        kernel: Passed to ``get_kernelspec``; the result feeds
            ``get_notebook_scaffolding``.
    '''
    if notebook.endswith('.ipynb'):
        file_name = notebook
    else:
        file_name = notebook + ".ipynb"
    notebook_path = os.path.join(os.getcwd(), file_name)

    mkdir_p(os.path.dirname(notebook_path))

    if not force_overwrite and safe_isfile(notebook_path):
        prompt = (
            'Warning, {notebook_path} already exists and continuing '
            'will overwrite the existing notebook. '
            'Are you sure you want to continue?'
        ).format(notebook_path=notebook_path)
        click.confirm(prompt, abort=True)

    # The scaffold is generated only after the file has been opened for writing,
    # and the confirmation is echoed before the file is closed.
    with open(notebook_path, 'w') as notebook_file:
        notebook_file.write(get_notebook_scaffolding(get_kernelspec(kernel)))
        created_message = "Created new dagstermill notebook at {path}".format(
            path=notebook_path)
        click.echo(created_message)
示例#13
0
def test_safe_isfile():
    '''safe_isfile accepts an existing sibling file and rejects a missing one.'''
    # Presumably the test module itself -- confirm against the real file name.
    existing_path = file_relative_path(__file__, 'test_file_utils.py')
    assert safe_isfile(existing_path)

    missing_path = file_relative_path(__file__, 'not_a_file.py')
    assert not safe_isfile(missing_path)
示例#14
0
def test_safe_isfile():
    '''safe_isfile accepts an existing sibling file and rejects a missing one.'''
    # Presumably the test module itself -- confirm against the real file name.
    existing_path = script_relative_path('test_safe_isfile.py')
    assert safe_isfile(existing_path)

    missing_path = script_relative_path('test_safe_isfile_foobar.py')
    assert not safe_isfile(missing_path)
示例#15
0
def test_safe_isfile():
    '''safe_isfile accepts an existing sibling file and rejects a missing one.'''
    # Presumably the test module itself -- confirm against the real file name.
    existing_path = script_relative_path('test_file_utils.py')
    assert safe_isfile(existing_path)

    missing_path = script_relative_path('not_a_file.py')
    assert not safe_isfile(missing_path)