def test_dagster_type_collision():
    class Foo(object):
        pass

    _Foo_1 = as_dagster_type(Foo)
    with pytest.raises(
        DagsterInvalidDefinitionError,
        match='A Dagster runtime type has already been registered for the Python type',
    ):
        _Foo_2 = as_dagster_type(Foo)
Example #2
def test_make_dagster_type():
    OverwriteNameTuple = as_dagster_type(collections.namedtuple('SomeNamedTuple', 'prop'))
    runtime_type = resolve_to_runtime_type(OverwriteNameTuple)
    assert runtime_type.name == 'SomeNamedTuple'
    assert OverwriteNameTuple(prop='foo').prop == 'foo'

    OverwriteNameTuple = as_dagster_type(
        collections.namedtuple('SomeNamedTuple', 'prop'), name='OverwriteName'
    )
    runtime_type = resolve_to_runtime_type(OverwriteNameTuple)
    assert runtime_type.name == 'OverwriteName'
    assert OverwriteNameTuple(prop='foo').prop == 'foo'
Example #3
    def get_resource_init_input_hydration_pipeline(resources_initted):
        @resource
        def resource_a(_):
            resources_initted['a'] = True
            yield 'A'

        class CustomType(str):
            pass

        @input_hydration_config(String, required_resource_keys={'a'})
        def InputHydration(context, hello):
            assert context.resources.a == 'A'
            return CustomType(hello)

        CustomDagsterType = as_dagster_type(
            CustomType, name='CustomType', input_hydration_config=InputHydration
        )

        @solid(input_defs=[InputDefinition('custom_type', CustomDagsterType)])
        def input_hydration_solid(context, custom_type):
            context.log.info(custom_type)

        @solid(output_defs=[OutputDefinition(CustomDagsterType)])
        def source_custom_type(_):
            return CustomType('from solid')

        @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
        def selective_pipeline():
            input_hydration_solid(source_custom_type())

        return selective_pipeline
Example #4
    def define_input_hydration_pipeline(should_require_resources):
        @resource
        def resource_a(_):
            yield 'A'

        class CustomType(str):
            pass

        @input_hydration_config(
            String, required_resource_keys={'a'} if should_require_resources else set()
        )
        def InputHydration(context, hello):
            assert context.resources.a == 'A'
            return CustomType(hello)

        CustomDagsterType = as_dagster_type(
            CustomType, name='CustomType', input_hydration_config=InputHydration
        )

        @solid(input_defs=[InputDefinition('custom_type', CustomDagsterType)])
        def input_hydration_solid(context, custom_type):
            context.log.info(custom_type)

        @pipeline(mode_defs=[ModeDefinition(resource_defs={'a': resource_a})])
        def input_hydration_pipeline():
            input_hydration_solid()

        return input_hydration_pipeline
Example #5
def test_make_dagster_type_from_builtin():
    OrderedDict = as_dagster_type(collections.OrderedDict)
    assert OrderedDict is collections.OrderedDict
    assert OrderedDict([('foo', 'bar')]) == collections.OrderedDict([('foo',
                                                                      'bar')])
    assert isinstance(resolve_to_runtime_type(OrderedDict), RuntimeType)
    assert resolve_to_runtime_type(
        OrderedDict).python_type is collections.OrderedDict
Example #6
def test_python_built_in_output():
    class MyOrderedDict(collections.OrderedDict):
        pass

    OrderedDict = as_dagster_type(MyOrderedDict)

    @lambda_solid
    def emit_ordered_dict() -> OrderedDict:
        return OrderedDict([('foo', 'bar')])

    output_value = execute_solid(emit_ordered_dict).output_value()
    assert output_value == OrderedDict([('foo', 'bar')])
    assert isinstance(output_value, OrderedDict)
    assert isinstance(output_value, MyOrderedDict)
    assert isinstance(output_value, collections.OrderedDict)
Example #7
from dagster import execute_pipeline, pipeline, as_dagster_type, lambda_solid
import pandas as pd
# Data validation: every node's source and destination data is type-checked as the PandasDataFrame Dagster type (the solids below are wired into a pipeline in the sketch at the end of this example).

DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
)


@lambda_solid
def Input1() -> DataFrame:  # first node which reads input file -> file1.csv
    r = pd.read_csv('file1.csv')
    return r


@lambda_solid
def Input2() -> DataFrame:  # second node which reads input file ->file2.csv
    r2 = pd.read_csv('file2.csv')
    return r2


@lambda_solid  #Represents third node which merges input from file1 and file2
def Merge(r: DataFrame, r2: DataFrame) -> DataFrame:
    r3 = pd.concat([r, r2], axis=1)
    return r3


@lambda_solid  # Fourth node, which writes the merged output file.
def Result_output(y: DataFrame) -> DataFrame:
    y3 = y
    y3.to_csv(r'merged_output.csv')
    return y3  # return the frame so it satisfies the DataFrame output annotation
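
The snippet above defines the solids but stops before wiring them together. A minimal sketch of that wiring, assuming the four-node layout implied by the comments (the pipeline name merge_pipeline is hypothetical; pipeline and execute_pipeline are already imported at the top of this example):

@pipeline
def merge_pipeline():
    # file1.csv and file2.csv are read, merged column-wise, and written back out.
    Result_output(Merge(Input1(), Input2()))

# execute_pipeline(merge_pipeline)  # requires file1.csv and file2.csv to exist on disk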
Example #8
# The MIT License (MIT)
# Copyright (c) 2019 Ian Buttimer

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from dagster import as_dagster_type
from bitarray import bitarray

BitArray = as_dagster_type(
    bitarray,
    name='BitArray',
    description='''bitarray: efficient arrays of booleans.
    See https://pypi.org/project/bitarray/, https://github.com/ilanschnell/bitarray''',
)
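
A brief usage sketch, not part of the original file, showing how the registered BitArray type can annotate a solid's output (lambda_solid and the emit_bits name are illustrative assumptions):

from dagster import lambda_solid

@lambda_solid
def emit_bits() -> BitArray:
    # The return value is type-checked against the BitArray runtime type registered above.
    return bitarray('10110')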
Example #9
    check.dict_param(file_options, 'file_options')

    if file_type == 'csv':
        path = file_options['path']
        return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return pd.read_parquet(file_options['path'])
    elif file_type == 'table':
        return pd.read_table(file_options['path'])
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type)
        )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/''',
    input_schema=dataframe_input_schema,
    output_schema=dataframe_output_schema,
    metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            EventMetadataEntry.json({'columns': list(value.columns)}, 'metadata'),
        ]
    ),
)
Example #10
            [OrderedDict(sorted(x.items(), key=lambda x: x[0])) for x in csv.DictReader(fd)]
        )


@output_schema(Path)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = as_dagster_type(
    PoorMansDataFrame_, input_schema=df_input_schema, output_schema=df_output_schema
)


def get_events_of_type(events, event_type):
    return [
        event
        for event in events
        if event.is_dagster_event and event.dagster_event.event_type == event_type
    ]


def test_running():
    run_id = make_new_run_id()
    handle = ExecutionTargetHandle.for_pipeline_fn(define_passing_pipeline)
    pipeline = define_passing_pipeline()
Example #11
    if file_type == 'csv':
        path = file_options['path']
        return pd.read_csv(path, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        return pd.read_parquet(file_options['path'])
    elif file_type == 'table':
        return pd.read_csv(file_options['path'], sep='\t')
    else:
        raise DagsterInvariantViolationError(
            'Unsupported file_type {file_type}'.format(file_type=file_type)
        )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    typecheck_metadata_fn=lambda value: TypeCheck(
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count', 'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))}, 'metadata'),
        ]
    ),
)
Example #12
)
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
        return file_options['path']
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkDataFrameType = as_dagster_type(
    DataFrame,
    name='SparkDataFrameType',
    description='A Pyspark data frame.',
    storage_plugins={
        RunStorageMode.S3: SparkDataFrameS3StoragePlugin,
        RunStorageMode.FILESYSTEM: SparkDataFrameFilesystemStoragePlugin,
    },
    output_schema=spark_df_output_schema,
)


SqlAlchemyEngineType = as_dagster_type(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)


class SqlTableName(Stringish):
    def __init__(self):
Example #13
File: types.py  Project: yetudada/dagster
"""Type definitions for the airline_demo."""

from collections import namedtuple

import sqlalchemy

from dagster import as_dagster_type
from dagster.core.types.runtime_type import create_string_type

AirlineDemoResources = namedtuple(
    'AirlineDemoResources',
    ('spark', 's3', 'db_url', 'db_engine', 'db_dialect',
     'redshift_s3_temp_dir', 'db_load'),
)

SqlAlchemyEngineType = as_dagster_type(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)

SqlTableName = create_string_type('SqlTableName',
                                  description='The name of a database table')
DbInfo = namedtuple('DbInfo',
                    'engine url jdbc_url dialect load_table host db_name')
Example #14
from dagster import execute_pipeline, pipeline, as_dagster_type, lambda_solid, dagster_type
from dagit import *
from graphql import *
from dagster_graphql import *
import pandas as pd

# Data validation: every node's source and destination data is type-checked as the PandasDataFrame Dagster type.
DataFrame = as_dagster_type(
    pd.pandas.core.frame.DataFrame,
    name='PandasDataFrame',
)


@lambda_solid  #Defines a node in the workflow.
def Input1() -> DataFrame:  # first node which reads input file -> file1.csv
    r = pd.read_csv('file1.csv')
    return r


@lambda_solid
def Input2() -> DataFrame:  # second node which reads input file ->file2.csv
    r2 = pd.read_csv('file2.csv')
    return r2


@lambda_solid  #Represents third node which merges input from file1 and file2
def Merge(r: DataFrame, r2: DataFrame) -> DataFrame:
    r3 = pd.concat([r, r2], axis=1)
    return r3

Example #15
    )


# Placeholder class to cause the unregistered notebook solid to fail -- custom serialization
# strategies require repository registration
class ComplexSerializationStrategy(SerializationStrategy):  # pylint: disable=no-init
    def serialize(self, value, write_file_obj):
        pass  # pragma: nocover

    def deserialize(self, read_file_obj):
        pass  # pragma: nocover


complex_serialization_strategy = ComplexSerializationStrategy('complex')

ComplexDagsterType = as_dagster_type(
    pd.DataFrame, serialization_strategy=complex_serialization_strategy)


@solid('resource_solid', required_resource_keys={'list'})
def resource_solid(context):
    context.resources.list.append('Hello, solid!')
    return True


@solid_definition
def hello_world_resource_solid():
    return dagstermill.define_dagstermill_solid(
        'hello_world_resource',
        nb_test_path('hello_world_resource'),
        input_defs=[InputDefinition('nonce')],
        required_resource_keys={'list'},
Example #16
                'header': Field(Bool, is_optional=True),
            }))
    }))
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.spark_session.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(file_options['path'],
                     header=file_options.get('header'),
                     sep=file_options.get('sep'))
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(RDD,
                           'SparkRDD',
                           input_hydration_config=load_rdd,
                           output_materialization_config=write_rdd)


@output_selector_schema(
    Selector({
        'csv':
        Field(
            Dict({
                'path': Field(Path),
                'sep': Field(String, is_optional=True),
                'header': Field(Bool, is_optional=True),
            }))
    }))
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
Example #17
File: __init__.py  Project: zorrock/dagster
            )
        }
    )
)
def write_rdd(context, file_type, file_options, spark_rdd):
    if file_type == 'csv':
        df = context.resources.spark.createDataFrame(spark_rdd)
        context.log.info('DF: {}'.format(df))
        df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


SparkRDD = as_dagster_type(RDD, 'SparkRDD', input_schema=load_rdd, output_schema=write_rdd)


@resource(config_field=Field(Dict({'spark_conf': spark_config()})))
def spark_session_resource(init_context):
    builder = SparkSession.builder
    flat = flatten_dict(init_context.resource_config['spark_conf'])
    for key, value in flat:
        builder = builder.config(key, value)

    spark = builder.getOrCreate()
    try:
        yield spark
    finally:
        spark.stop()
Example #18
# pylint: disable=no-value-for-parameter

import collections

from dagster import Any, Field, as_dagster_type, pipeline, solid

Counter = as_dagster_type(collections.Counter)


@solid(config_field=Field(Any))
def multiply_the_word(context, word: str) -> str:
    return word * context.solid_config['factor']


@solid
def count_letters(_, word: str) -> Counter:
    return collections.Counter(word)


@pipeline
def configuration_schema_pipeline():
    return count_letters(multiply_the_word())
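
A hedged sketch of how this pipeline might be executed: the solid config read via context.solid_config['factor'] and the otherwise unsatisfied 'word' input are supplied through run config. The environment_dict shape below is an assumption based on the run-config style of this Dagster version:

from dagster import execute_pipeline  # not imported in the snippet above

execute_pipeline(
    configuration_schema_pipeline,
    environment_dict={
        'solids': {
            'multiply_the_word': {
                'config': {'factor': 2},               # consumed by context.solid_config['factor']
                'inputs': {'word': {'value': 'bar'}},  # satisfies the top-level 'word' input
            }
        }
    },
)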
Example #19
        success=True,
        metadata_entries=[
            EventMetadataEntry.text(str(len(value)), 'row_count',
                                    'Number of rows in DataFrame'),
            # string cast columns since they may be things like datetime
            EventMetadataEntry.json({'columns': list(map(str, value.columns))},
                                    'metadata'),
        ],
    )


DataFrame = as_dagster_type(
    pd.DataFrame,
    name='PandasDataFrame',
    description='''Two-dimensional size-mutable, potentially heterogeneous
    tabular data structure with labeled axes (rows and columns).
    See http://pandas.pydata.org/''',
    input_hydration_config=dataframe_input_schema,
    output_materialization_config=dataframe_output_schema,
    type_check=df_type_check,
)


def _construct_constraint_list(constraints):
    def add_bullet(constraint_list, constraint_description):
        return constraint_list + "+ {constraint_description}\n".format(
            constraint_description=constraint_description)

    constraint_list = ""
    for constraint in constraints:
        if constraint.__class__ not in CONSTRAINT_BLACKLIST:
            constraint_list = add_bullet(constraint_list,
Example #20
    Field,
    Output,
    OutputDefinition,
    String,
    as_dagster_type,
    execute_pipeline,
    pipeline,
    solid,
)


class _DataFrame(list):
    pass


DataFrame = as_dagster_type(_DataFrame, name='DataFrame')


@solid
def read_csv(context, csv_path):
    with open(csv_path, 'r') as fd:
        lines = [row for row in csv.DictReader(fd)]

    context.log.info('Read {n_lines} lines'.format(n_lines=len(lines)))
    return DataFrame(lines)


@solid(
    config={
        'process_hot': Field(Bool, is_optional=True, default_value=True),
        'process_cold': Field(Bool, is_optional=True, default_value=True),
Example #21
File: setup.py  Project: jmbrooks/dagster
        ])


@output_materialization_config(Path)
def df_output_schema(_context, path, value):
    with open(path, 'w') as fd:
        writer = csv.DictWriter(fd, fieldnames=value[0].keys())
        writer.writeheader()
        writer.writerows(rowdicts=value)

    return Materialization.file(path)


PoorMansDataFrame = as_dagster_type(
    PoorMansDataFrame_,
    input_hydration_config=df_input_schema,
    output_materialization_config=df_output_schema,
)


def define_test_subprocess_context(instance):
    return define_subprocess_context_for_file(__file__, "define_repository",
                                              instance)


def define_test_context(instance=None):
    return define_context_for_file(__file__, "define_repository", instance)


@lambda_solid(
    input_defs=[InputDefinition('num', PoorMansDataFrame)],
Example #22
import sqlalchemy

from pyspark.sql import DataFrame

from dagster import dagster_type, as_dagster_type
from dagster.core.types.runtime import PythonObjectType, Stringish
from dagster.utils import safe_isfile

AirlineDemoResources = namedtuple(
    'AirlineDemoResources',
    ('spark', 's3', 'db_url', 'db_engine', 'db_dialect',
     'redshift_s3_temp_dir', 'db_load'),
)

SparkDataFrameType = as_dagster_type(DataFrame,
                                     name='SparkDataFrameType',
                                     description='A Pyspark data frame.')

SqlAlchemyEngineType = as_dagster_type(
    sqlalchemy.engine.Connectable,
    name='SqlAlchemyEngineType',
    description='A SqlAlchemy Connectable',
)


class SqlTableName(Stringish):
    def __init__(self):
        super(SqlTableName,
              self).__init__(description='The name of a database table')