Example #1
    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix="dagster",
        use_ssl=True,
        verify=True,
        verify_cert_path=None,
        endpoint_url=None,
        skip_empty_files=False,
    ):
        _verify = False if not verify else verify_cert_path
        self._s3_session = boto3.resource(
            "s3", use_ssl=use_ssl, verify=_verify, endpoint_url=endpoint_url
        ).meta.client
        self._s3_bucket = check.str_param(bucket, "bucket")
        self._s3_prefix = check.str_param(prefix, "prefix")

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
        self._skip_empty_files = check.bool_param(skip_empty_files, "skip_empty_files")
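Taken together with Example #16 below, this appears to be the constructor of S3ComputeLogManager. A hedged construction sketch, with hypothetical bucket and prefix values:

manager = S3ComputeLogManager(        # class name inferred from Example #16
    bucket="my-compute-log-bucket",
    prefix="dagster-logs",
    use_ssl=True,
    verify=True,                      # set verify_cert_path to pin a CA bundle
    endpoint_url=None,                # e.g. an S3-compatible endpoint such as MinIO
    skip_empty_files=True,
)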
Example #2
def events_jar():
    git_repo_root = (subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip())

    temp_dir = os.path.join(get_system_temp_directory(),
                            "dagster_examples_tests",
                            "event_pipeline_demo_tests")

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, "events.jar")

    if os.path.exists(dst):
        print("events jar already exists, skipping")  # pylint: disable=print-call
    else:
        subprocess.check_call(["sbt", "events/assembly"],
                              cwd=os.path.join(git_repo_root, "scala_modules"))

        src = os.path.join(
            git_repo_root,
            "scala_modules",
            "events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar",
        )
        subprocess.check_call(["cp", src, dst])

    yield dst
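Since the helper yields rather than returns, it is presumably consumed as a pytest-style fixture. A minimal sketch of driving the generator by hand, assuming events_jar is importable from this module:

gen = events_jar()
jar_path = next(gen)   # builds the jar with sbt if it is not already cached
print(jar_path)        # .../dagster_examples_tests/event_pipeline_demo_tests/events.jar
gen.close()            # nothing follows the yield, so closing performs no teardown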
Example #3
    def __init__(self,
                 run_id,
                 type_storage_plugin_registry=None,
                 base_dir=None):
        self.run_id = check.str_param(run_id, 'run_id')
        type_storage_plugin_registry = check.inst_param(
            type_storage_plugin_registry if type_storage_plugin_registry else
            TypeStoragePluginRegistry(types_to_register={}),
            'type_storage_plugin_registry',
            TypeStoragePluginRegistry,
        )

        self._base_dir = os.path.abspath(
            os.path.expanduser(
                check.opt_nonempty_str_param(
                    base_dir, 'base_dir', seven.get_system_temp_directory())))
        check.invariant(
            os.path.isdir(self._base_dir),
            'Could not find a directory at the base_dir supplied to FileSystemIntermediateStore: '
            '{base_dir}'.format(base_dir=self._base_dir),
        )

        object_store = FileSystemObjectStore()

        root = object_store.key_for_paths(
            [self.base_dir, 'dagster', 'runs', run_id, 'files'])

        super(FileSystemIntermediateStore, self).__init__(
            object_store,
            root=root,
            type_storage_plugin_registry=type_storage_plugin_registry)
Example #4
    def operator_for_solid(
        cls,
        handle,
        pipeline_name,
        environment_dict,
        mode,
        solid_name,
        step_keys,
        dag,
        dag_id,
        op_kwargs,
    ):
        tmp_dir = op_kwargs.pop('tmp_dir', DOCKER_TEMPDIR)
        host_tmp_dir = op_kwargs.pop('host_tmp_dir', seven.get_system_temp_directory())

        if 'storage' not in environment_dict:
            raise airflow_storage_exception(tmp_dir)

        # black 18.9b0 doesn't support py27-compatible formatting of the below invocation (omitting
        # the trailing comma after **op_kwargs) -- black 19.3b0 supports multiple python versions,
        # but currently doesn't know what to do with from __future__ import print_function -- see
        # https://github.com/ambv/black/issues/768
        # fmt: off
        return DagsterDockerOperator(
            step=solid_name,
            environment_dict=environment_dict,
            dag=dag,
            tmp_dir=tmp_dir,
            pipeline_name=pipeline_name,
            mode=mode,
            step_keys=step_keys,
            task_id=solid_name,
            host_tmp_dir=host_tmp_dir,
            **op_kwargs
        )
Example #5
def test_file_system_intermediate_store():
    run_id = str(uuid.uuid4())

    intermediate_store = FileSystemIntermediateStore(run_id=run_id)
    assert intermediate_store.root == os.path.join(
        seven.get_system_temp_directory(), 'dagster', 'runs', run_id, 'files')

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(True, context, RuntimeBool.inst(),
                                          ['true'])
            assert intermediate_store.has_object(context, ['true'])
            assert intermediate_store.get_object(context, RuntimeBool.inst(),
                                                 ['true']).obj is True
            assert intermediate_store.uri_for_paths(['true']).startswith('file:///')
            assert intermediate_store.rm_object(context, ['true']) is None
            assert intermediate_store.rm_object(context, ['true']) is None
            assert intermediate_store.rm_object(context,
                                                ['dslkfhjsdflkjfs']) is None
        finally:
            try:
                shutil.rmtree(intermediate_store.root)
            except seven.FileNotFoundError:
                pass
Example #6
File: cli.py Project: zuik/dagster
def construct_environment_yaml(preset_name, config, pipeline_name,
                               module_name):
    # Load environment dict from either a preset or yaml file globs
    if preset_name:
        if config:
            raise click.UsageError("Can not use --preset with --config.")

        cli_args = {
            "fn_name": pipeline_name,
            "pipeline_name": pipeline_name,
            "module_name": module_name,
        }
        pipeline = recon_repo_for_cli_args(
            cli_args).get_definition().get_pipeline(pipeline_name)
        run_config = pipeline.get_preset(preset_name).run_config

    else:
        config = list(config)
        run_config = load_yaml_from_glob_list(config) if config else {}

    # If not provided by the user, ensure we have storage location defined
    if "intermediate_storage" not in run_config:
        system_tmp_path = seven.get_system_temp_directory()
        dagster_tmp_path = os.path.join(system_tmp_path, "dagster-airflow",
                                        pipeline_name)
        run_config["intermediate_storage"] = {
            "filesystem": {
                "config": {
                    "base_dir": dagster_tmp_path
                }
            }
        }

    return run_config
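When neither --preset nor --config is given, the helper falls back to filesystem intermediate storage rooted under the system temp directory. A minimal sketch of that default, assuming get_system_temp_directory() resolves to the platform temp dir and using a hypothetical pipeline name:

import os
import tempfile

pipeline_name = "my_pipeline"  # hypothetical
default_run_config = {
    "intermediate_storage": {
        "filesystem": {
            "config": {
                "base_dir": os.path.join(tempfile.gettempdir(), "dagster-airflow", pipeline_name)
            }
        }
    }
}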
Example #7
    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix="dagster",
        json_credentials_envvar=None,
    ):
        self._bucket_name = check.str_param(bucket, "bucket")
        self._prefix = check.str_param(prefix, "prefix")

        if json_credentials_envvar:
            json_info_str = os.environ.get(json_credentials_envvar)
            credentials_info = json.loads(json_info_str)
            self._bucket = (storage.Client().from_service_account_info(
                credentials_info).bucket(self._bucket_name))
        else:
            self._bucket = storage.Client().bucket(self._bucket_name)

        # Check if the bucket exists
        check.invariant(self._bucket.exists())

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data",
                                               ConfigurableClassData)
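A hedged sketch of the json_credentials_envvar path above; the environment variable name and bucket are hypothetical, and the class name is assumed since the snippet only shows __init__:

import json
import os

# Illustrative only -- real service-account JSON is required for the bucket check to pass.
os.environ["GCS_LOG_CREDENTIALS"] = json.dumps({
    "type": "service_account",
    # ... remaining service-account fields ...
})

manager = GCSComputeLogManager(               # class name assumed
    bucket="my-log-bucket",
    prefix="dagster",
    json_credentials_envvar="GCS_LOG_CREDENTIALS",
)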
Example #8
def test_file_system_intermediate_store_composite_types_with_custom_serializer_for_inner_type():
    run_id = str(uuid.uuid4())

    intermediate_store = FileSystemIntermediateStore(run_id=run_id)
    assert intermediate_store.root == os.path.join(
        seven.get_system_temp_directory(), 'dagster', 'runs', run_id, 'files')

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object(
                ['foo', 'bar'],
                context,
                resolve_to_runtime_type(List[LowercaseString]).inst(),
                ['list'],
            )
            assert intermediate_store.has_object(context, ['list'])
            assert intermediate_store.get_object(
                context,
                resolve_to_runtime_type(List[Bool]).inst(),
                ['list']).obj == ['foo', 'bar']

        finally:
            try:
                shutil.rmtree(intermediate_store.root)
            except seven.FileNotFoundError:
                pass
Example #9
def events_jar():
    git_repo_root = six.ensure_str(
        subprocess.check_output(['git', 'rev-parse',
                                 '--show-toplevel']).strip())

    temp_dir = os.path.join(get_system_temp_directory(),
                            'dagster_examples_tests',
                            'event_pipeline_demo_tests')

    mkdir_p(temp_dir)
    dst = os.path.join(temp_dir, 'events.jar')

    if os.path.exists(dst):
        print('events jar already exists, skipping')
    else:
        subprocess.check_call(['sbt', 'events/assembly'],
                              cwd=os.path.join(git_repo_root, 'scala_modules'))

        src = os.path.join(
            git_repo_root,
            'scala_modules',
            'events/target/scala-2.11/events-assembly-0.1.0-SNAPSHOT.jar',
        )
        subprocess.check_call(['cp', src, dst])

    yield dst
Example #10
def get_active_repository_data_from_image(image):
    check.str_param(image, 'image')

    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())
        run_serialized_container_command(
            image=image,
            command='dagster repository snapshot {output_file}'.format(
                output_file=os.path.join(DEFAULT_INTERNAL_VOLUME,
                                         output_file_name)),
            volumes={
                tmp_dir: {
                    'bind': DEFAULT_INTERNAL_VOLUME,
                    'mode': DEFAULT_MODE
                }
            },
        )

        active_repo_data = _get_active_repo_data(
            os.path.join(tmp_dir, output_file_name))
        if not isinstance(active_repo_data, ActiveRepositoryData):
            raise DagsterInvariantViolationError(
                "Deserialized snapshot is of type {received} must be a ActiveRepositoryData"
                .format(received=type(active_repo_data)))
        return active_repo_data
Example #11
    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix='dagster',
        use_ssl=True,
        verify=True,
        verify_cert_path=None,
        endpoint_url=None,
    ):
        _verify = False if not verify else verify_cert_path
        self._s3_session = create_s3_session(use_ssl=use_ssl,
                                             verify=_verify,
                                             endpoint_url=endpoint_url)
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._s3_prefix = check.str_param(prefix, 'prefix')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data',
                                               ConfigurableClassData)
Example #12
    def conditionally_fail(_):
        if os.path.isfile(
                os.path.join(get_system_temp_directory(),
                             "chained_failure_pipeline_conditionally_fail")):
            raise Exception("blah")

        return "hello"
Example #13
def get_papermill_parameters(step_context, inputs, output_log_path):
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "step_context",
        "StepExecutionContext must have valid run_config",
    )
    check.dict_param(inputs, "inputs", key_type=str)

    run_id = step_context.run_id
    temp_dir = get_system_temp_directory()
    marshal_dir = os.path.normpath(
        os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))
    mkdir_p(marshal_dir)

    if not isinstance(step_context.pipeline, ReconstructablePipeline):
        raise DagstermillError(
            "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "
            "Use the reconstructable() function if executing from python")

    dm_executable_dict = step_context.pipeline.to_dict()

    dm_context_dict = {
        "output_log_path": output_log_path,
        "marshal_dir": marshal_dir,
        "run_config": step_context.run_config,
    }

    dm_solid_handle_kwargs = step_context.solid_handle._asdict()

    parameters = {}

    input_def_dict = step_context.solid_def.input_dict
    for input_name, input_value in inputs.items():
        assert (
            input_name not in RESERVED_INPUT_NAMES
        ), "Dagstermill solids cannot have inputs named {input_name}".format(
            input_name=input_name)
        dagster_type = input_def_dict[input_name].dagster_type
        parameter_value = write_value(
            dagster_type,
            input_value,
            os.path.join(
                marshal_dir,
                f"{str(step_context.solid_handle)}-input-{input_name}"),
        )
        parameters[input_name] = parameter_value

    parameters["__dm_context"] = dm_context_dict
    parameters["__dm_executable_dict"] = dm_executable_dict
    parameters["__dm_pipeline_run_dict"] = pack_value(
        step_context.pipeline_run)
    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs
    parameters["__dm_instance_ref_dict"] = pack_value(
        step_context.instance.get_ref())

    return parameters
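The returned dict is what gets injected into the notebook as papermill parameters. A sketch of its shape, with placeholder values only:

parameters = {
    "<input_name>": "<value written by write_value>",   # one entry per solid input
    "__dm_context": {
        "output_log_path": "<output_log_path>",
        "marshal_dir": "<temp_dir>/dagstermill/<run_id>/marshal",
        "run_config": {},
    },
    "__dm_executable_dict": {},        # step_context.pipeline.to_dict()
    "__dm_pipeline_run_dict": {},      # pack_value(step_context.pipeline_run)
    "__dm_solid_handle_kwargs": {},
    "__dm_instance_ref_dict": {},      # pack_value(step_context.instance.get_ref())
}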
Example #14
def open_server_process(
    port,
    socket,
    loadable_target_origin=None,
    max_workers=1,
    heartbeat=False,
    heartbeat_timeout=30,
    lazy_load_user_code=False,
):
    check.invariant((port or socket) and not (port and socket),
                    "Set only port or socket")
    check.opt_inst_param(loadable_target_origin, "loadable_target_origin",
                         LoadableTargetOrigin)
    check.int_param(max_workers, "max_workers")

    output_file = os.path.join(
        get_system_temp_directory(),
        "grpc-server-startup-{uuid}".format(uuid=uuid.uuid4().hex))

    subprocess_args = (
        [
            loadable_target_origin.executable_path
            if loadable_target_origin and loadable_target_origin.executable_path
            else sys.executable,
            "-m",
            "dagster.grpc",
        ]
        + (["--port", str(port)] if port else [])
        + (["--socket", socket] if socket else [])
        + ["-n", str(max_workers)]
        + (["--heartbeat"] if heartbeat else [])
        + (["--heartbeat-timeout", str(heartbeat_timeout)] if heartbeat_timeout else [])
        + (["--lazy-load-user-code"] if lazy_load_user_code else [])
        + ["--ipc-output-file", output_file]
    )

    if loadable_target_origin:
        subprocess_args += (
            (
                ["-f", loadable_target_origin.python_file]
                + (
                    ["-d", loadable_target_origin.working_directory]
                    if loadable_target_origin.working_directory
                    else ["--empty-working-directory"]
                )
                if loadable_target_origin.python_file
                else []
            )
            + (["-m", loadable_target_origin.module_name] if loadable_target_origin.module_name else [])
            + (["-a", loadable_target_origin.attribute] if loadable_target_origin.attribute else [])
        )

    server_process = open_ipc_subprocess(subprocess_args)

    try:
        wait_for_grpc_server(output_file)
    except:
        if server_process.poll() is None:
            server_process.terminate()
        raise

    return server_process
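A hedged usage sketch: start the server subprocess on an arbitrary port and make sure it is torn down. The port value is hypothetical, and the gRPC interaction itself is elided:

server_process = open_server_process(
    port=4266,
    socket=None,        # exactly one of port / socket may be set
    max_workers=2,
    heartbeat=True,
    heartbeat_timeout=30,
)
try:
    pass  # talk to the server over its gRPC API here
finally:
    if server_process.poll() is None:
        server_process.terminate()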
Example #15
def test_write_configs():
    ec2_config = EC2Config(
        remote_host='foo',
        region='us-west-1',
        security_group_id='sg-12345',
        key_pair_name='foobar',
        key_file_path='/some/path',
        ami_id='ami-12345',
    )

    rds_config = RDSConfig(instance_name='foo',
                           instance_uri='foo-bar.amazonaws.com',
                           password='******')

    # Ensure unique dir for test
    tmp_dir = os.path.join(seven.get_system_temp_directory(), uuid.uuid4().hex)
    outfile = os.path.join(tmp_dir, HOST_CONFIG_FILE)

    ec2_config.save(tmp_dir)
    rds_config.save(tmp_dir)

    with open(outfile) as f:
        record = yaml.load(f)

    ec2_config_dict = record['ec2']
    rds_config_dict = record['rds']

    assert ec2_config_dict['remote_host'] == 'foo'
    assert ec2_config_dict['region'] == 'us-west-1'
    assert ec2_config_dict['security_group_id'] == 'sg-12345'
    assert ec2_config_dict['key_pair_name'] == 'foobar'
    assert ec2_config_dict['key_file_path'] == '/some/path'
    assert ec2_config_dict['ami_id'] == 'ami-12345'
    assert EC2Config.load(tmp_dir) == EC2Config(
        **ec2_config_dict) == ec2_config

    assert rds_config_dict['instance_name'] == 'foo'
    assert rds_config_dict['instance_uri'] == 'foo-bar.amazonaws.com'
    assert rds_config_dict['storage_size_gb'] == 20
    assert rds_config_dict['db_engine'] == 'postgres'
    assert rds_config_dict['db_engine_version'] == '11.5'
    assert RDSConfig.load(tmp_dir) == RDSConfig(**rds_config_dict)

    # Delete both configs
    res = rds_config.delete(tmp_dir)
    assert res
    assert not RDSConfig.exists(tmp_dir)
    res = ec2_config.delete(tmp_dir)
    assert res
    assert not EC2Config.exists(tmp_dir)

    # Try to delete non-existent config
    res = rds_config.delete(tmp_dir)
    assert not res
Example #16
    def __init__(self, bucket, local_dir=None, inst_data=None):
        self._s3_session = create_s3_session()
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        super(S3ComputeLogManager, self).__init__(inst_data=inst_data)
Example #17
    def __init__(self, bucket, local_dir=None, inst_data=None, prefix='dagster'):
        self._s3_session = create_s3_session()
        self._s3_bucket = check.str_param(bucket, 'bucket')
        self._s3_prefix = check.str_param(prefix, 'prefix')
        self._download_urls = {}

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, 'inst_data', ConfigurableClassData)
Example #18
def test_ssh_sftp(sftpserver):
    tmp_path = get_system_temp_directory()
    readme_file = os.path.join(tmp_path, "readme.txt")

    @solid(
        config_schema={
            "local_filepath":
            Field(str, is_required=True, description="local file path to get"),
            "remote_filepath":
            Field(str, is_required=True,
                  description="remote file path to get"),
        },
        required_resource_keys={"ssh_resource"},
    )
    def sftp_solid_get(context):
        local_filepath = context.solid_config.get("local_filepath")
        remote_filepath = context.solid_config.get("remote_filepath")
        return context.resources.ssh_resource.sftp_get(remote_filepath,
                                                       local_filepath)

    with sftpserver.serve_content({"a_dir": {"readme.txt": "hello, world"}}):
        result = execute_solid(
            sftp_solid_get,
            ModeDefinition(resource_defs={"ssh_resource": sshresource}),
            run_config={
                "solids": {
                    "sftp_solid_get": {
                        "config": {
                            "local_filepath": readme_file,
                            "remote_filepath": "a_dir/readme.txt",
                        }
                    }
                },
                "resources": {
                    "ssh_resource": {
                        "config": {
                            "remote_host": sftpserver.host,
                            "remote_port": sftpserver.port,
                            "username": "******",
                            "password": "******",
                            "no_host_key_check": True,
                        }
                    }
                },
            },
        )
        assert result.success

    with open(readme_file, "rb") as f:
        contents = f.read()
        assert b"hello, world" in contents
Example #19
def test_ssh_sftp(sftpserver):
    tmp_path = get_system_temp_directory()
    readme_file = os.path.join(tmp_path, 'readme.txt')

    @solid(
        config_schema={
            'local_filepath':
            Field(str, is_required=True, description='local file path to get'),
            'remote_filepath':
            Field(str, is_required=True,
                  description='remote file path to get'),
        },
        required_resource_keys={'ssh_resource'},
    )
    def sftp_solid_get(context):
        local_filepath = context.solid_config.get('local_filepath')
        remote_filepath = context.solid_config.get('remote_filepath')
        return context.resources.ssh_resource.sftp_get(remote_filepath,
                                                       local_filepath)

    with sftpserver.serve_content({'a_dir': {'readme.txt': 'hello, world'}}):
        result = execute_solid(
            sftp_solid_get,
            ModeDefinition(resource_defs={'ssh_resource': sshresource}),
            run_config={
                'solids': {
                    'sftp_solid_get': {
                        'config': {
                            'local_filepath': readme_file,
                            'remote_filepath': 'a_dir/readme.txt',
                        }
                    }
                },
                'resources': {
                    'ssh_resource': {
                        'config': {
                            'remote_host': sftpserver.host,
                            'remote_port': sftpserver.port,
                            'username': '******',
                            'password': '******',
                            'no_host_key_check': True,
                        }
                    }
                },
            },
        )
        assert result.success

    with open(readme_file, 'rb') as f:
        contents = f.read()
        assert b'hello, world' in contents
Example #20
def get_papermill_parameters(step_context, inputs, output_log_path,
                             compute_descriptor):
    check.inst_param(step_context, "step_context", StepExecutionContext)
    check.param_invariant(
        isinstance(step_context.run_config, dict),
        "step_context",
        "StepExecutionContext must have valid run_config",
    )
    check.dict_param(inputs, "inputs", key_type=str)

    run_id = step_context.run_id
    temp_dir = get_system_temp_directory()
    marshal_dir = os.path.normpath(
        os.path.join(temp_dir, "dagstermill", str(run_id), "marshal"))
    mkdir_p(marshal_dir)

    if not isinstance(step_context.pipeline, ReconstructablePipeline):
        if compute_descriptor == "solid":
            raise DagstermillError(
                "Can't execute a dagstermill solid from a pipeline that is not reconstructable. "
                "Use the reconstructable() function if executing from python")
        else:
            raise DagstermillError(
                "Can't execute a dagstermill op from a job that is not reconstructable. "
                "Use the reconstructable() function if executing from python")

    dm_executable_dict = step_context.pipeline.to_dict()

    dm_context_dict = {
        "output_log_path": output_log_path,
        "marshal_dir": marshal_dir,
        "run_config": step_context.run_config,
    }

    dm_solid_handle_kwargs = step_context.solid_handle._asdict()
    dm_step_key = step_context.step.key

    parameters = {}

    parameters["__dm_context"] = dm_context_dict
    parameters["__dm_executable_dict"] = dm_executable_dict
    parameters["__dm_pipeline_run_dict"] = pack_value(
        step_context.pipeline_run)
    parameters["__dm_solid_handle_kwargs"] = dm_solid_handle_kwargs
    parameters["__dm_instance_ref_dict"] = pack_value(
        step_context.instance.get_ref())
    parameters["__dm_step_key"] = dm_step_key
    parameters["__dm_input_names"] = list(inputs.keys())

    return parameters
Example #21
def test_remove_ssh_key():
    # Ensure SSH is running
    subprocess.call(['ssh-agent', '-s'])

    test_key_path = os.path.join(seven.get_system_temp_directory(), 'test.pem')
    with open(test_key_path, 'wb') as f:
        f.write(TEST_PEM_PRIVATE_KEY)

    os.chmod(test_key_path, 0o600)

    subprocess.call(['ssh-add', '-D'])
    assert remove_ssh_key('does/not/matter')

    subprocess.call(['ssh-add', test_key_path])
    assert not remove_ssh_key('/key/does/not/exist.pem')
    assert remove_ssh_key(test_key_path)
Example #22
    def __init__(self, dagster_operator_parameters, *args):
        kwargs = dagster_operator_parameters.op_kwargs
        tmp_dir = kwargs.pop("tmp_dir", DOCKER_TEMPDIR)
        host_tmp_dir = kwargs.pop("host_tmp_dir",
                                  seven.get_system_temp_directory())
        self.host_tmp_dir = host_tmp_dir
        self.docker_conn_id_set = kwargs.get("docker_conn_id") is not None
        self.run_config = dagster_operator_parameters.run_config
        self.pipeline_name = dagster_operator_parameters.pipeline_name
        self.pipeline_snapshot = dagster_operator_parameters.pipeline_snapshot
        self.execution_plan_snapshot = dagster_operator_parameters.execution_plan_snapshot
        self.parent_pipeline_snapshot = dagster_operator_parameters.parent_pipeline_snapshot
        self.mode = dagster_operator_parameters.mode
        self.step_keys = dagster_operator_parameters.step_keys
        self.recon_repo = dagster_operator_parameters.recon_repo
        self._run_id = None

        self.instance_ref = dagster_operator_parameters.instance_ref
        check.invariant(self.instance_ref)
        self.instance = DagsterInstance.from_ref(self.instance_ref)

        # These shenanigans are so we can override DockerOperator.get_hook in order to configure
        # a docker client using docker.from_env, rather than messing with the logic of
        # DockerOperator.execute
        if not self.docker_conn_id_set:
            try:
                from_env().version()
            except Exception:
                pass
            else:
                kwargs["docker_conn_id"] = True

        if "environment" not in kwargs:
            kwargs["environment"] = get_aws_environment()

        super(DagsterDockerOperator, self).__init__(
            task_id=dagster_operator_parameters.task_id,
            dag=dagster_operator_parameters.dag,
            tmp_dir=tmp_dir,
            host_tmp_dir=host_tmp_dir,
            xcom_push=True,
            # We do this because log lines won't necessarily be emitted in order (!) -- so we can't
            # just check the last log line to see if it's JSON.
            xcom_all=True,
            *args,
            **kwargs,
        )
Example #23
    def __init__(
        self,
        bucket,
        local_dir=None,
        inst_data=None,
        prefix="dagster",
    ):
        self._bucket_name = check.str_param(bucket, "bucket")
        self._prefix = check.str_param(prefix, "prefix")

        self._bucket = storage.Client().get_bucket(self._bucket_name)

        # proxy calls to local compute log manager (for subscriptions, etc)
        if not local_dir:
            local_dir = seven.get_system_temp_directory()

        self.local_manager = LocalComputeLogManager(local_dir)
        self._inst_data = check.opt_inst_param(inst_data, "inst_data", ConfigurableClassData)
Example #24
    def __init__(self, run_id, types_to_register=None, base_dir=None):
        self.run_id = check.str_param(run_id, 'run_id')
        self.storage_mode = RunStorageMode.FILESYSTEM
        self._base_dir = os.path.abspath(
            os.path.expanduser(
                check.opt_nonempty_str_param(
                    base_dir, 'base_dir', seven.get_system_temp_directory()
                )
            )
        )
        check.invariant(
            os.path.isdir(self._base_dir),
            'Could not find a directory at the base_dir supplied to FileSystemObjectStore: '
            '{base_dir}'.format(base_dir=self._base_dir),
        )
        self.root = get_run_files_directory(self.base_dir, run_id)

        super(FileSystemObjectStore, self).__init__(types_to_register)
Example #25
def remove_ssh_key(key_file_path):
    # We have to clean up after ourselves to avoid "Too many authentication failures" issue.
    Term.waiting('Removing SSH key from authentication agent...')

    # AWS only gives us the private key contents; ssh-add uses the private key for adding but the
    # public key for removing
    try:
        public_keys = six.ensure_str(subprocess.check_output(
            ['ssh-add', '-L'])).strip().split('\n')
    except subprocess.CalledProcessError:
        Term.rewind()
        Term.info('No identities found, skipping')
        return True

    filtered_public_keys = [key for key in public_keys if key_file_path in key]
    public_key = filtered_public_keys[0] if filtered_public_keys else None

    if public_key:
        tmp_pub_file = os.path.join(seven.get_system_temp_directory(),
                                    uuid.uuid4().hex + '-tmp-pubkey')

        with open(tmp_pub_file, 'wb') as f:
            f.write(six.ensure_binary(public_key))

        res = subprocess.Popen(['ssh-add', '-d', tmp_pub_file],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT).communicate()
        res = six.ensure_str(res[0])

        os.unlink(tmp_pub_file)

        if 'Identity removed' in res:
            Term.rewind()
            Term.success('key deleted successfully')
            return True
        else:
            Term.warning('Could not remove key, error: %s' % res)
            return False
    else:
        Term.rewind()
        Term.info('key not found, skipping')
        return False

    return True
Example #26
def sync_dagster_yaml(ec2_config, rds_config):
    '''Configure Dagster instance to use PG storage by putting a dagster.yaml file in the remote
    DAGSTER_HOME directory
    '''
    with open(os.path.join(os.path.dirname(__file__), 'conf', 'dagster.template.yaml'), 'rb') as f:
        dagster_yaml = six.ensure_str(f.read()).format(
            username=rds_config.username,
            password=rds_config.password,
            hostname=rds_config.instance_uri,
            db_name=rds_config.db_name,
            port=DEFAULT_RDS_PORT,
        )

    tmp_file = os.path.join(seven.get_system_temp_directory(), 'dagster.yaml')

    with open(tmp_file, 'wb') as f:
        f.write(six.ensure_binary(dagster_yaml))

    rsync_to_remote(ec2_config.key_file_path, tmp_file, ec2_config.remote_host, SERVER_DAGSTER_HOME)
Example #27
File: cli.py Project: nikie/dagster
def sync_dagster_yaml(ec2_config, rds_config):
    '''Configure Dagster instance to use PG storage by putting a dagster.yaml file in the remote
    DAGSTER_HOME directory
    '''
    with open(os.path.join(os.path.dirname(__file__), 'conf', 'dagster.template.yaml'), 'rb') as f:
        dagster_yaml = six.ensure_str(f.read())

    dagster_yaml = (
        dagster_yaml.replace('{username}', rds_config.username)
        .replace('{password}', rds_config.password)
        .replace('{host}', rds_config.instance_uri)
        .replace('{database}', rds_config.db_name)
    )

    tmp_file = os.path.join(seven.get_system_temp_directory(), 'dagster.yaml')

    with open(tmp_file, 'wb') as f:
        f.write(six.ensure_binary(dagster_yaml))

    rsync_to_remote(ec2_config.key_file_path, tmp_file, ec2_config.remote_host, SERVER_DAGSTER_HOME)
Example #28
def execute_pipeline_iterator_from_image(
    image, pipeline_name, environment_dict=None, mode=None, solid_subset=None
):
    # This method currently depends on file mounts, and will not work when executing within
    # a docker container

    check.str_param(image, 'image')
    check.str_param(pipeline_name, 'pipeline_name')
    check.opt_dict_param(environment_dict, 'environment-dict', key_type=str)
    mode = check.opt_str_param(mode, 'mode', DEFAULT_MODE_NAME)
    check.opt_list_param(solid_subset, 'solid-subset', of_type="str")

    if not environment_dict:
        environment_dict = {}

    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())

        command = (
            "dagster api execute_pipeline -y repository.yaml {pipeline_name} "
            "{output_file} --environment-dict='{environment_dict}' --mode={mode}".format(
                pipeline_name=pipeline_name,
                output_file=os.path.join(DEFAULT_INTERNAL_VOLUME, output_file_name),
                environment_dict=json.dumps(environment_dict),
                mode=mode,
            )
        )

        if solid_subset:
            command += " --solid_subset={solid_subset}".format(solid_subset=",".join(solid_subset))

        for event in run_detached_container_command(
            image=image,
            command=command,
            volumes={tmp_dir: {'bind': DEFAULT_INTERNAL_VOLUME, 'mode': DEFAULT_MODE}},
            output_file=os.path.join(tmp_dir, output_file_name),
        ):
            yield event
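A hedged usage sketch; the image, pipeline name, and environment dict are hypothetical:

for event in execute_pipeline_iterator_from_image(
    image="my-org/dagster-pipelines:latest",
    pipeline_name="my_pipeline",
    environment_dict={"solids": {}},
    mode="default",
):
    print(event)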
Example #29
def get_active_repository_data_from_image(image):
    check.str_param(image, 'image')

    with get_temp_dir(in_directory=get_system_temp_directory()) as tmp_dir:
        output_file_name = "{}.json".format(uuid4())
        command = 'dagster api snapshot repository {output_file}'.format(
            output_file=os.path.join(DEFAULT_INTERNAL_VOLUME,
                                     output_file_name))
        output = run_serialized_container_command(
            image=image,
            command=command,
            volumes={
                tmp_dir: {
                    'bind': DEFAULT_INTERNAL_VOLUME,
                    'mode': DEFAULT_MODE
                }
            },
        )

        if len(output) != 1:
            print(output)
            raise DagsterInvariantViolationError(
                "Running command {command} in container {image} resulted in output of length "
                "{actual} lines, expected {expected} lines".format(
                    command=command,
                    image=image,
                    actual=len(output),
                    expected=1))

        serialized_active_repo_data = output[0]
        active_repo_data = deserialize_json_to_dagster_namedtuple(
            serialized_active_repo_data)

        if not isinstance(active_repo_data, ActiveRepositoryData):
            raise DagsterInvariantViolationError(
                "Deserialized snapshot is of type {received} must be a ActiveRepositoryData"
                .format(received=type(active_repo_data)))
        return active_repo_data
Example #30
def test_file_system_intermediate_store_composite_types():
    run_id = str(uuid.uuid4())

    intermediate_store = FileSystemIntermediateStore(run_id=run_id)
    assert intermediate_store.root == os.path.join(
        seven.get_system_temp_directory(), 'dagster', 'runs', run_id, 'files')

    with yield_empty_pipeline_context(run_id=run_id) as context:
        try:
            intermediate_store.set_object([True, False], context,
                                          resolve_to_runtime_type(List(Bool)).inst(),
                                          ['bool'])
            assert intermediate_store.has_object(context, ['bool'])
            assert intermediate_store.get_object(
                context,
                resolve_to_runtime_type(List(Bool)).inst(),
                ['bool']) == [True, False]

        finally:
            try:
                shutil.rmtree(intermediate_store.root)
            except seven.FileNotFoundError:
                pass