Exemplo n.º 1
0
def test_add_remote():
    """Bind a remote to the test context twice and push a bundle each time.

    The remote is first bound with the bucket-only URL and then re-bound
    with the bucket-plus-key URL; after each binding a small bundle named
    'output' is created, committed and pushed.
    """
    # Recreate the context so the test starts from a known state.
    api.delete_context(TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)

    # Setup moto s3 resources
    client = boto3.client('s3')
    resource = boto3.resource('s3', region_name='us-east-1')
    resource.create_bucket(Bucket=TEST_BUCKET)

    # Make sure bucket is empty
    listing = client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'

    # First pass binds the remote with just the bucket, second pass with
    # bucket and key; the bundle work is identical for both.
    for remote_url in (TEST_BUCKET_URL, TEST_BUCKET_KEY_URL):
        api.remote(TEST_CONTEXT, TEST_REMOTE, remote_url)

        with api.Bundle(TEST_CONTEXT) as bundle:
            bundle.name = 'output'
            bundle.add_data([1, 3, 5])

        bundle.commit()
        bundle.push()

    api.delete_context(TEST_CONTEXT)
Exemplo n.º 2
0
def test():
    """Push a file bundle, remove it, pull it back and run a pipeline on it.

    Steps: create the context and bind the remote, write three text files
    into a new bundle, commit and push it, call ``rm`` on it, pull it
    back with ``localize=False``, apply ``ConsumeExtDep`` with
    ``incremental_pull=True``, and finally delete the context (including
    the remote).
    """
    api.context(TEST_CONTEXT)
    api.remote(TEST_CONTEXT, TEST_CONTEXT, REMOTE_URL, force=True)

    with api.Bundle(TEST_CONTEXT, TEST_NAME,
                    owner=getpass.getuser()) as bundle:
        for index in range(3):
            target = bundle.add_file('output_{}'.format(index))
            with target.open('w') as handle:
                handle.write("some text for the {} file".format(index))

    committed = bundle.commit()
    committed.push()

    bundle.rm()

    bundle.pull(localize=False)

    api.apply(TEST_CONTEXT, '-', 'test_output', 'ConsumeExtDep',
              incremental_pull=True)

    api.delete_context(TEST_CONTEXT, remote=True)
Exemplo n.º 3
0
def create_remote_file_bundle(name):
    """Create a bundle whose data is three s3 paths and verify the first one.

    The bundle holds:
     a.) an unmanaged s3 path (this source file uploaded to the test bucket)
     b.) a managed s3 path obtained from the bundle
     c.) a managed s3 path inside a bundle-provided remote directory

    After the bundle is closed it is fetched again by uuid, committed, and
    the md5 of its first data entry is compared with the md5 of this file.

    Args:
        name: Name given to the created bundle.
    """
    resource = boto3.resource('s3', region_name='us-east-1')
    resource.create_bucket(Bucket=TEST_BUCKET)

    # Copy a local file to moto s3 bucket
    expected_md5 = md5_file(__file__)
    aws_s3.put_s3_file(__file__, TEST_BUCKET_URL)

    unmanaged_path = os.path.join(TEST_BUCKET_URL,
                                  os.path.basename(__file__))

    with api.Bundle(TEST_CONTEXT, name=name) as bundle:
        managed_path = bundle.get_remote_file('test_s3_file.txt')
        aws_s3.cp_local_to_s3_file(__file__, managed_path)

        managed_dir = bundle.get_remote_directory('vince/klartho')
        managed_dir_path = os.path.join(managed_dir, 'test_s3_file.txt')
        aws_s3.cp_local_to_s3_file(__file__, managed_dir_path)

        bundle.add_data([unmanaged_path, managed_path, managed_dir_path])
        bundle.add_tags({'info': 'added an s3 file'})

    bundle_uuid = bundle.uuid

    # Re-fetch by uuid and commit before checking the stored content.
    bundle = api.get(TEST_CONTEXT, None, uuid=bundle_uuid)
    bundle.commit()
    actual_md5 = md5_file(bundle.data[0])
    print(actual_md5)
    print(expected_md5)
    assert actual_md5 == expected_md5
Exemplo n.º 4
0
def test_pull(run_test):
    """Push a bundle to the remote, recreate the context and pull it back.

    Checks that the bucket is empty before the push and non-empty after
    it, and that the bundle found after the pull carries the original
    data. The bucket is emptied and deleted at the end.
    """
    client = boto3.client('s3')
    resource = boto3.resource('s3', region_name='us-east-1')
    resource.create_bucket(Bucket=TEST_BUCKET)
    bucket = resource.Bucket(TEST_BUCKET)

    listing = client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' not in listing, 'Bucket should be empty'

    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)

    # Create the bundle, then look it up by name before pushing it.
    _ = api.Bundle(TEST_CONTEXT, name='remote_test', data='Hello')
    pushed = api.get(TEST_CONTEXT, 'remote_test')

    assert pushed.data == 'Hello'

    pushed.commit()
    pushed.push()

    listing = client.list_objects(Bucket=TEST_BUCKET)
    assert 'Contents' in listing, 'Bucket should not be empty'
    assert len(listing['Contents']) > 0, 'Bucket should not be empty'

    # Drop the local context and rebuild it so the bundle can only come
    # back through the pull.
    api.delete_context(context_name=TEST_CONTEXT)
    api.context(context_name=TEST_CONTEXT)
    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)
    api.pull(TEST_CONTEXT)

    found = api.search(TEST_CONTEXT)
    assert len(found) > 0, 'No bundles were pulled'
    assert found[0].data == 'Hello', 'Bundle contains incorrect data'

    bucket.objects.all().delete()
    bucket.delete()
Exemplo n.º 5
0
def test_string_task():
    """Store a string as bundle data and check it reads back as text."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='string_task', data='output')
    fetched = api.get(TEST_CONTEXT, 'string_task')
    data = fetched.data

    assert data == 'output', 'Data did not match output'
    # Exact type match on purpose: subclasses of the text type do not count.
    assert type(data) is six.text_type, 'Data is not string'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 6
0
def test_float_task():
    """Store a float as bundle data and check it reads back as a float."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='float_task', data=2.5)
    fetched = api.get(TEST_CONTEXT, 'float_task')
    data = fetched.data

    assert data == 2.5, 'Data did not match output'
    # Exact type match on purpose: float subclasses do not count.
    assert type(data) is float, 'Data is not float'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 7
0
def test_list_task():
    """Store a list as bundle data and check it reads back as an ndarray.

    The list is expected to come back as a ``numpy.ndarray`` holding the
    same elements.
    """
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    _ = api.Bundle(TEST_CONTEXT, name='list_task', data=[1, 2, 3])
    fetched = api.get(TEST_CONTEXT, 'list_task')
    data = fetched.data

    assert np.array_equal(data, [1, 2, 3]), 'Data did not match output'
    assert type(data) is np.ndarray, 'Data is not list'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 8
0
def test_args_bundle():
    """Attach params to a new bundle and check they are returned unchanged."""
    with api.Bundle(TEST_CONTEXT) as bundle:
        bundle.add_params(serialized_json_args)
        bundle.name = 'output'

    # Look the bundle up again by name rather than reusing the open handle.
    bundle = api.get(TEST_CONTEXT, 'output')

    assert bundle.params == serialized_json_args
Exemplo n.º 9
0
def test_dict_task():
    """Store a dict as bundle data and check it reads back as a dict."""
    setup()
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    payload = {'hello': ['world']}
    _ = api.Bundle(TEST_CONTEXT, name='dict_task', data=payload)
    payload = api.get(TEST_CONTEXT, 'dict_task').data

    assert payload == {'hello': ['world']}, 'Data did not match output'
    # Exact type match on purpose: dict subclasses do not count.
    assert type(payload) is dict, 'Data is not dict'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 10
0
def test_df_task():
    """Store a DataFrame as bundle data and check it reads back equal."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    # Build the frame empty first, then add the single column 'a'.
    frame = pd.DataFrame()
    frame['a'] = [1, 2, 3]

    _ = api.Bundle(TEST_CONTEXT, name='df_task', data=frame)
    fetched = api.get(TEST_CONTEXT, 'df_task')
    data = fetched.data

    assert frame.equals(data), 'Data did not match output'
    assert type(data) is pd.DataFrame, 'Data is not df'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 11
0
def test_local_file(run_test):
    """ Test copying in local file

    A temporary file outside the bundle is added as the bundle's data.
    After the temporary file has been closed, the bundle is fetched again
    by uuid and the md5 of its data must match the md5 recorded from the
    temporary file.
    """

    # The context manager closes the temporary file even when bundle
    # creation or hashing raises, so a failing test does not leak it.
    with tempfile.NamedTemporaryFile() as local_fp:
        local_fp.write(b'an external local file')
        # Flush so the bytes are on disk before the path is handed over.
        local_fp.flush()

        with api.Bundle(TEST_CONTEXT, name=TEST_BUNDLE) as b:
            b.add_data(local_fp.name)
            b.add_tags({'info': 'added a local file'})

        saved_uuid = b.uuid
        saved_md5 = md5_file(local_fp.name)

    # The temporary file is closed at this point; the check below reads
    # whatever path the bundle reports as its data.
    b = api.get(TEST_CONTEXT, None, uuid=saved_uuid)
    assert md5_file(b.data) == saved_md5
Exemplo n.º 12
0
def test_file_task():
    """Write a file via a bundle, store its path as data and read it back."""
    assert len(api.search(TEST_CONTEXT)) == 0, 'Context should be empty'

    with api.Bundle(TEST_CONTEXT, name='file_task') as bundle:
        managed_path = bundle.get_file("test.txt")
        with open(managed_path, mode='w') as writer:
            writer.write('5')
        bundle.add_data(managed_path)

    output_path = api.get(TEST_CONTEXT, 'file_task').data

    with open(output_path) as reader:
        contents = reader.read()

    assert contents == '5', 'Data did not match output'
    # Exact type match on purpose: the data must be a plain str path.
    assert type(output_path) is str, 'Data is not path'
    assert len(api.search(TEST_CONTEXT)) == 1, \
        'Only one bundle should be present'
Exemplo n.º 13
0
def test_independent_context():
    """Check that a bundle created in one context is not visible in another.

    Two contexts are created, a bundle is added to the first only, and
    both contexts are deleted and checked to be gone afterwards.
    """
    context_1_name = '__test_context_1__'
    context_2_name = '__test_context_2__'
    both_contexts = (context_1_name, context_2_name)

    for context_name in both_contexts:
        api.context(context_name)

    _ = api.Bundle(context_1_name, name='context_test', data=2)

    assert len(api.search(context_1_name)) == 1, \
        'Only one bundle should be in context one'
    assert len(api.search(context_2_name)) == 0, 'Context two should be empty'

    # Delete both contexts first, then verify each one is no longer listed.
    for context_name in both_contexts:
        api.delete_context(context_name=context_name)

    for context_name in both_contexts:
        assert context_name not in api.ls_contexts(), \
            'Contexts should be removed'
Exemplo n.º 14
0
def test_zero_copy_local_file(run_test):
    """ Test managed path of a local file

    Two files are written through bundle-provided file targets and added
    as the bundle's data; after re-fetching the bundle by uuid, the md5 of
    each data entry must match the md5 of the corresponding written file.
    """

    with api.Bundle(TEST_CONTEXT, name=TEST_BUNDLE) as bundle:
        first_target = bundle.get_file("file_1.txt")
        second_target = bundle.get_file("file_2.txt")
        with first_target.open(mode='w') as handle:
            handle.write("This is our first file!")
        with second_target.open(mode='w') as handle:
            handle.write("This is our second file!")
        bundle.add_data([first_target, second_target])
        bundle.add_params({'type': 'file'})

    bundle_uuid = bundle.uuid
    first_md5 = md5_file(first_target.path)
    second_md5 = md5_file(second_target.path)

    bundle = api.get(TEST_CONTEXT, None, uuid=bundle_uuid)
    assert md5_file(bundle.data[0]) == first_md5
    assert md5_file(bundle.data[1]) == second_md5
Exemplo n.º 15
0
def create_local_file_bundle(name):
    """
    Create a local file bundle.  It has an external file,
    a managed file, and a managed dir file.

    The md5 of each of the four files is stored on the bundle as tags
    'f0'..'f3'.  After the bundle is closed it is fetched again by uuid,
    committed, and every data entry is checked against the recorded hash.

    Args:
        name: Name of the bundle to create; it is also written into the
            content of the three managed files.

    Returns:
        The bundle fetched by uuid, after commit.

    """
    # The context manager closes the temporary file even if writing the
    # managed files or building the bundle raises, so nothing is leaked on
    # a failure.
    with tempfile.NamedTemporaryFile() as local_fp:
        local_fp.write(b'an external local file in bundle')
        # Flush so the bytes are on disk before the path is hashed/added.
        local_fp.flush()

        with api.Bundle(TEST_CONTEXT, name=name) as b:
            f1 = b.get_file("file_1.txt")
            f2 = b.get_file("file_2.txt")
            f3 = os.path.join(b.get_directory("vince/klartho"), 'file_3.txt')
            with open(f1, mode='w') as f:
                f.write("This is our first file! {}".format(name))
            with open(f2, mode='w') as f:
                f.write("This is our second file! {}".format(name))
            with open(f3, mode='w') as f:
                f.write("This is our third file! {}".format(name))
            b.add_data([local_fp.name, f1, f2, f3])
            # Hashes are keyed by position so they can be matched against
            # the order of b.data below.
            hashes = {
                "f{}".format(i): md5_file(f)
                for i, f in enumerate([local_fp.name, f1, f2, f3])
            }
            b.add_tags(hashes)

    saved_uuid = b.uuid
    b = api.get(TEST_CONTEXT, None, uuid=saved_uuid)
    b.commit()
    for i, f in enumerate(b.data):
        assert md5_file(f) == hashes["f{}".format(i)]

    return b
Exemplo n.º 16
0
def test_zero_copy_s3_file(run_test):
    """ Test managed path of an s3 file

    This source file is copied to a bundle-provided remote file target,
    which is then added as the bundle's data.  After re-fetching the
    bundle by uuid and pulling with ``localize=True``, the md5 of its data
    must match the md5 of this source file.
    """
    resource = boto3.resource('s3')
    resource.create_bucket(Bucket=TEST_BUCKET)

    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)

    expected_md5 = md5_file(__file__)

    with api.Bundle(TEST_CONTEXT, name=TEST_BUNDLE) as bundle:
        remote_target = bundle.get_remote_file('test_s3_file.txt')
        aws_s3.cp_local_to_s3_file(__file__, remote_target.path)
        bundle.add_data(remote_target)
        bundle.add_tags({'info': 'added an s3 file'})
    bundle_uuid = bundle.uuid

    bundle = api.get(TEST_CONTEXT, None, uuid=bundle_uuid)
    bundle.pull(localize=True)
    actual_md5 = md5_file(bundle.data)
    print(actual_md5)
    print(expected_md5)
    assert actual_md5 == expected_md5
Exemplo n.º 17
0
def test_copy_in_s3_file_with_remote(run_test):
    """ Test copying in s3 file
    The file should be copied into the remote context

    With a remote bound to the context, an s3 path added as bundle data is
    expected to come back as an ``s3://`` path when the bundle is fetched
    again by uuid.
    """

    s3_resource = boto3.resource('s3')
    s3_resource.create_bucket(Bucket=TEST_BUCKET)

    api.remote(TEST_CONTEXT, TEST_REMOTE, TEST_BUCKET_URL)

    # Copy a local file to moto s3 bucket.  No md5 is recorded here: this
    # test only checks where the data lives, not its content.
    aws_s3.put_s3_file(__file__, TEST_BUCKET_URL)

    s3_file = os.path.join(TEST_BUCKET_URL, os.path.basename(__file__))

    with api.Bundle(TEST_CONTEXT, name=TEST_BUNDLE) as b:
        b.add_data(s3_file)
        b.add_tags({'info': 'added an s3 file'})
    saved_uuid = b.uuid

    b = api.get(TEST_CONTEXT, None, uuid=saved_uuid)
    assert b.data.startswith("s3://")
Exemplo n.º 18
0
def test_copy_in_s3_file(run_test):
    """ Test copying in s3 file
    The file should be copied into the local context

    This source file is uploaded to the test bucket and its s3 path added
    as bundle data; the md5 of the data of the re-fetched bundle must
    match the md5 of this source file.
    """

    resource = boto3.resource('s3', region_name="us-east-1")
    resource.create_bucket(Bucket=TEST_BUCKET)

    # Copy a local file to moto s3 bucket
    expected_md5 = md5_file(__file__)
    aws_s3.put_s3_file(__file__, TEST_BUCKET_URL)

    uploaded_path = os.path.join(TEST_BUCKET_URL, os.path.basename(__file__))

    with api.Bundle(TEST_CONTEXT, name=TEST_BUNDLE) as bundle:
        bundle.add_data(uploaded_path)
        bundle.add_tags({'info': 'added an s3 file'})
    bundle_uuid = bundle.uuid

    bundle = api.get(TEST_CONTEXT, None, uuid=bundle_uuid)
    actual_md5 = md5_file(bundle.data)
    print(actual_md5)
    print(expected_md5)
    assert actual_md5 == expected_md5
Exemplo n.º 19
0
    def add_external_dependency(self,
                                param_name,
                                task_class,
                                params,
                                human_name=None,
                                uuid=None):
        """
        Disdat Pipe API Function

        Add an external task and its parameters to our requirements.   What this means is that
        there is no run function and, in that case, Luigi will ignore the results of task.deps() (which calls
        flatten(self.requires())).  And what that means is that this requirement can only be satisfied
        by the bundle actually existing.

        Create ersatz ExternalDepTask parameterized by uuid and processing_name
        Note: it is possible to use class/params when searching by class, params, but this makes all external
        dependencies look the same in the code.  Win.

        NOTE: if you add an external dependency by name, it is possible that someone adds a bundle during
        execution and that your requires function is no longer deterministic.   You must add caching to your
        requires function to handle this scenario.

        Example with class variable bundle_uuid:
        ``
        if self.bundle_uuid is None:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
            self.bundle_uuid = bundle.uuid
        else:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
        ``

        TODO: Consider pushing caching into this layer.

        Args:
            param_name (str): The parameter name this bundle assumes when passed to Pipe.run
            task_class (object):  Class name of upstream task if looking for external bundle by processing_id.
            params (dict):  Dictionary of parameters if looking for external bundle by processing_id.
                The dictionary is not modified by this call.
            human_name (str): Resolve dependency by human_name, return the latest bundle with that human_name.  Trumps task_class and params.
            uuid (str): Resolve dependency by explicit UUID, trumps task_class, params and human_name.

        Returns:
            `api.Bundle` or None.  None is returned when the bundle could not be resolved; in that case
            the dependency is still registered, with uuid and processing_name set to the string 'None'.

        Raises:
            Exception: if task_class is given and params is not a dictionary.
            AssertionError: if param_name has already been registered in self.add_deps.

        """
        import disdat.api as api

        if task_class is not None and not isinstance(params, dict):
            error = "add_external_dependency requires parameter dictionary"
            raise Exception(error)

        assert (param_name not in self.add_deps)

        # Bound before the try so that every path after it sees a defined value.
        bundle = None

        # Parameters reported in the "can't resolve" message.  Replaced by the
        # augmented copy when resolving by processing id.
        lookup_params = params

        try:
            if uuid is not None:
                hfr = self.pfs.get_hframe_by_uuid(
                    uuid, data_context=self.data_context)
            elif human_name is not None:
                hfr = self.pfs.get_latest_hframe(
                    human_name, data_context=self.data_context)
            else:
                # we propagate the same inputs and the same output dir for every upstream task!
                # Work on a copy so the caller's dictionary is left untouched.
                lookup_params = dict(params)
                lookup_params.update({
                    'user_arg_name': param_name,
                    'data_context': self.data_context
                })
                p = task_class(**lookup_params)
                hfr = self.pfs.get_hframe_by_proc(
                    p.processing_id(), data_context=self.data_context)

            if hfr is None:
                error_str = "Disdat can't resolve external bundle from class[{}] params[{}] name[{}] uuid[{}]".format(
                    task_class, lookup_params, human_name, uuid)
                raise ExtDepError(error_str)

            bundle = api.Bundle(
                self.data_context.get_local_name()).fill_from_hfr(hfr)

        except ExtDepError as error:  # Swallow and allow Luigi to determine task is not available.
            # Log the exception itself: it may have been raised by one of the
            # calls above rather than by the explicit raise, in which case no
            # local message string exists.
            _logger.error(error)
            bundle = None

        except Exception as error:
            _logger.error(error)
            bundle = None

        # Register the dependency whether or not the bundle was resolved; an
        # unresolved bundle is recorded with the placeholder string 'None'.
        if bundle is None:
            dep_params = {'uuid': 'None', 'processing_name': 'None'}
        else:
            dep_params = {
                'uuid': bundle.uuid,
                'processing_name': bundle.processing_name
            }
        self.add_deps[param_name] = (luigi.task.externalize(ExternalDepTask),
                                     dep_params)

        return bundle
Exemplo n.º 20
0
    def add_external_dependency(self,
                                param_name,
                                task_class,
                                params,
                                human_name=None,
                                uuid=None):
        """
        Disdat Pipe API Function

        Register an external task, together with its parameters, as one of our requirements.  An
        external task has no run function, so Luigi ignores the results of task.deps() (which calls
        flatten(self.requires())) for it.  Consequently the requirement can only be satisfied by the
        bundle actually existing.

        NOTE: when the dependency is added by name, someone may add a bundle during execution, making
        your requires function non-deterministic.  Add caching to your requires function to handle
        this scenario.

        Example with class variable bundle_uuid:
        ``
        if self.bundle_uuid is None:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
            self.bundle_uuid = bundle.uuid
        else:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
        ``

        Args:
            param_name (str): The parameter name this bundle assumes when passed to Pipe.run
            task_class (:object):  Must always set class name of upstream task.
            params (:dict):  Dictionary of parameters for this task.  When uuid or human_name is set,
                these are replaced by parameters derived from the resolved bundle.
            human_name (str): Resolve dependency by human_name, return the latest bundle with that human_name.  Trumps task_class and params.
            uuid (str): Resolve dependency by explicit UUID, trumps task_class and params, and human_name.

        Returns:
            The resolved bundle, or None if resolving or registering the dependency raised.

        Raises:
            Exception: if params is not a dictionary.
            AssertionError: if param_name has already been registered in self.add_deps.

        """

        # for the bundle object
        import disdat.api as api

        if not isinstance(params, dict):
            raise Exception(
                "add_dependency third argument must be a dictionary of parameters")

        assert param_name not in self.add_deps

        # True when the bundle is looked up directly rather than through the
        # task's pipe id; the task parameters then come from the bundle.
        resolved_explicitly = uuid is not None or human_name is not None

        try:
            if uuid is not None:
                hframe = self.pfs.get_hframe_by_uuid(
                    uuid, data_context=self.data_context)
            elif human_name is not None:
                hframe = self.pfs.get_latest_hframe(
                    human_name, data_context=self.data_context)
            else:
                upstream = task_class(**params)
                hframe = self.pfs.get_hframe_by_proc(
                    upstream.pipe_id(), data_context=self.data_context)

            bundle = api.Bundle(self.data_context.get_local_name(), 'unknown')

            bundle.fill_from_hfr(hframe)

            if resolved_explicitly:
                params = task_class._put_subcls_params(bundle.params)

            externalized = luigi.task.externalize(task_class)
            self.add_deps[param_name] = (externalized, params)

        except Exception as error:
            # Any failure above is reported as a warning and signalled to
            # the caller by returning None.
            message = "Unable to resolve external bundle made by class ({}): {}".format(
                task_class, error)
            _logger.warning(message)
            return None

        return bundle