Example #1
def define_builtin_scalar_output_schema(scalar_name):
    check.str_param(scalar_name, 'scalar_name')

    schema_cls = Selector({
        'json': define_path_dict_field(),
        'pickle': define_path_dict_field()
    })

    @dagster_type_materializer(schema_cls)
    def _builtin_materializer(_context, config_value, runtime_value):
        from dagster.core.events import Materialization

        file_type, file_options = list(config_value.items())[0]

        if file_type == 'json':
            json_file_path = file_options['path']
            json_value = seven.json.dumps({'value': runtime_value})
            with open(json_file_path, 'w') as ff:
                ff.write(json_value)
            return Materialization.file(json_file_path)
        elif file_type == 'pickle':
            pickle_file_path = file_options['path']
            with open(pickle_file_path, 'wb') as ff:
                pickle.dump(runtime_value, ff)
            return Materialization.file(pickle_file_path)
        else:
            check.failed('Unsupported file type: {file_type}'.format(
                file_type=file_type))

    return _builtin_materializer
def define_builtin_scalar_output_schema(scalar_name):
    check.str_param(scalar_name, "scalar_name")

    schema_cls = Selector({
        "json": define_path_dict_field(),
        "pickle": define_path_dict_field()
    })

    @dagster_type_materializer(schema_cls)
    def _builtin_materializer(_context, config_value, runtime_value):
        from dagster.core.events import AssetMaterialization

        file_type, file_options = list(config_value.items())[0]

        if file_type == "json":
            json_file_path = file_options["path"]
            json_value = seven.json.dumps({"value": runtime_value})
            with open(json_file_path, "w") as ff:
                ff.write(json_value)
            return AssetMaterialization.file(json_file_path)
        elif file_type == "pickle":
            pickle_file_path = file_options["path"]
            with open(pickle_file_path, "wb") as ff:
                pickle.dump(runtime_value, ff)
            return AssetMaterialization.file(pickle_file_path)
        else:
            check.failed("Unsupported file type: {file_type}".format(
                file_type=file_type))

    return _builtin_materializer
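
A note on the Selector dispatch above (a minimal sketch, not part of the original example; the path below is a placeholder): run config for a Selector supplies exactly one branch, so the decorated materializer receives a single-item dict, and list(config_value.items())[0] recovers the chosen branch and its options.

# Hypothetical config_value as it would arrive after the user selects the
# "json" branch of the Selector; only one key can be present at a time.
config_value = {"json": {"path": "/tmp/out.json"}}

# Mirrors the dispatch inside _builtin_materializer above.
file_type, file_options = list(config_value.items())[0]
assert file_type == "json" and file_options["path"] == "/tmp/out.json"
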
def define_typed_input_schema_dict(value_config_type):
    check.inst_param(value_config_type, "value_config_type", ConfigType)
    return Selector(
        {
            "value": Field(value_config_type),
            "json": define_path_dict_field(),
            "pickle": define_path_dict_field(),
        }, )
Example #4
def define_typed_input_schema_dict(value_config_type):
    check.inst_param(value_config_type, 'value_config_type', ConfigType)
    return Selector(
        {
            'value': Field(value_config_type),
            'json': define_path_dict_field(),
            'pickle': define_path_dict_field(),
        }, )
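
As a rough illustration (assumed, not shown on the original page), an input described by this schema accepts exactly one of the three branches: an inline value, or a path to a JSON or pickle file. For instance:

# Each of these dicts is a valid shape for the Selector returned above;
# only one branch may be chosen per input. Paths are placeholders.
inline_config = {'value': 42}
json_config = {'json': {'path': '/tmp/input.json'}}
pickle_config = {'pickle': {'path': '/tmp/input.pickle'}}
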
Example #5
def test_post_process_config():
    scalar_config_type = resolve_to_config_type(String)
    assert post_process_config(scalar_config_type, 'foo').value == 'foo'
    assert post_process_config(scalar_config_type, 3).value == 3
    assert post_process_config(scalar_config_type, {}).value == {}
    assert post_process_config(scalar_config_type, None).value is None

    enum_config_type = resolve_to_config_type(
        Enum('an_enum', [EnumValue('foo'),
                         EnumValue('bar', python_value=3)]))
    assert post_process_config(enum_config_type, 'foo').value == 'foo'
    assert post_process_config(enum_config_type, 'bar').value == 3
    assert ('config_value should be pre-validated'
            in post_process_config(enum_config_type, 'baz').errors[0].message)
    assert ('config_value should be pre-validated'
            in post_process_config(enum_config_type, None).errors[0].message)
    list_config_type = resolve_to_config_type([str])

    assert post_process_config(list_config_type, ['foo']).value == ['foo']
    assert post_process_config(list_config_type, None).value == []
    with pytest.raises(CheckError, match='Null array member not caught'):
        assert post_process_config(list_config_type, [None]).value == [None]

    nullable_list_config_type = resolve_to_config_type([Noneable(str)])
    assert post_process_config(nullable_list_config_type,
                               ['foo']).value == ['foo']
    assert post_process_config(nullable_list_config_type,
                               [None]).value == [None]
    assert post_process_config(nullable_list_config_type, None).value == []

    composite_config_type = resolve_to_config_type({
        'foo':
        String,
        'bar': {
            'baz': [str]
        },
        'quux':
        Field(str, is_required=False, default_value='zip'),
        'quiggle':
        Field(str, is_required=False),
    })

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(composite_config_type, {})

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(composite_config_type, {
            'bar': {
                'baz': ['giraffe']
            },
            'quux': 'nimble'
        })

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(composite_config_type, {
            'foo': 'zowie',
            'quux': 'nimble'
        })

    assert post_process_config(composite_config_type, {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        },
        'quux': 'nimble'
    }).value == {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        },
        'quux': 'nimble'
    }

    assert post_process_config(composite_config_type, {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        }
    }).value == {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        },
        'quux': 'zip'
    }

    assert post_process_config(composite_config_type, {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        },
        'quiggle': 'squiggle'
    }).value == {
        'foo': 'zowie',
        'bar': {
            'baz': ['giraffe']
        },
        'quux': 'zip',
        'quiggle': 'squiggle'
    }

    nested_composite_config_type = resolve_to_config_type({
        'fruts': {
            'apple': Field(String),
            'banana': Field(String, is_required=False),
            'potato': Field(String, is_required=False, default_value='pie'),
        }
    })

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(nested_composite_config_type, {'fruts': None})

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(nested_composite_config_type,
                            {'fruts': {
                                'banana': 'good',
                                'potato': 'bad'
                            }})

    assert post_process_config(nested_composite_config_type, {
        'fruts': {
            'apple': 'strawberry'
        }
    }).value == {
        'fruts': {
            'apple': 'strawberry',
            'potato': 'pie'
        }
    }

    assert post_process_config(nested_composite_config_type, {
        'fruts': {
            'apple': 'a',
            'banana': 'b',
            'potato': 'c'
        }
    }).value == {
        'fruts': {
            'apple': 'a',
            'banana': 'b',
            'potato': 'c'
        }
    }

    any_config_type = resolve_to_config_type(Any)

    assert post_process_config(any_config_type, {
        'foo': 'bar'
    }).value == {
        'foo': 'bar'
    }

    assert post_process_config(
        ConfigType('gargle', given_name='bargle', kind=ConfigTypeKind.ANY), 3)

    selector_config_type = resolve_to_config_type(
        Selector({
            'one':
            Field(String),
            'another': {
                'foo': Field(String, default_value='bar', is_required=False)
            },
            'yet_another':
            Field(String, default_value='quux', is_required=False),
        }))

    with pytest.raises(CheckError):
        post_process_config(selector_config_type, 'one')

    with pytest.raises(ParameterCheckError):
        post_process_config(selector_config_type, None)

    with pytest.raises(ParameterCheckError,
                       match='Expected dict with single item'):
        post_process_config(selector_config_type, {})

    with pytest.raises(CheckError):
        post_process_config(selector_config_type, {
            'one': 'foo',
            'another': 'bar'
        })

    assert post_process_config(selector_config_type, {
        'one': 'foo'
    }).value == {
        'one': 'foo'
    }

    assert post_process_config(selector_config_type, {
        'one': None
    }).value == {
        'one': None
    }

    assert post_process_config(selector_config_type, {
        'one': {}
    }).value == {
        'one': {}
    }

    assert post_process_config(selector_config_type, {
        'another': {}
    }).value == {
        'another': {
            'foo': 'bar'
        }
    }

    singleton_selector_config_type = resolve_to_config_type(
        Selector(
            {'foo': Field(String, default_value='bar', is_required=False)}))

    assert post_process_config(singleton_selector_config_type, None).value == {
        'foo': 'bar'
    }

    permissive_dict_config_type = resolve_to_config_type(
        Permissive({
            'foo': Field(String),
            'bar': Field(String, default_value='baz', is_required=False)
        }))

    with pytest.raises(CheckError,
                       match='Missing non-optional composite member'):
        post_process_config(permissive_dict_config_type, None)

    assert post_process_config(permissive_dict_config_type, {
        'foo': 'wow',
        'mau': 'mau'
    }).value == {
        'foo': 'wow',
        'bar': 'baz',
        'mau': 'mau',
    }
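
For readers following the assertions above: post_process_config returns a result object exposing .value (the config with defaults applied) and .errors. A minimal sketch using the same calls as the test (the field names here are hypothetical):

composite = resolve_to_config_type({
    'word': String,
    'punctuation': Field(str, is_required=False, default_value='!'),
})
result = post_process_config(composite, {'word': 'hi'})
# The optional member is filled in with its default value.
assert result.value == {'word': 'hi', 'punctuation': '!'}
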
Example #6
)
from dagster_pandas.validation import PandasColumn, validate_constraints

CONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}


@dagster_type_materializer(
    Selector(
        {
            "csv": {
                "path": StringSource,
                "sep": Field(
                    StringSource, is_required=False, default_value=","),
            },
            "parquet": {
                "path": StringSource
            },
            "table": {
                "path": StringSource
            },
            "pickle": {
                "path": StringSource
            },
        }, ))
def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, "pandas_df", pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == "csv":
        path = file_options["path"]
        pandas_df.to_csv(path,
Example #7
from pyspark.sql import DataFrame as NativeSparkDataFrame

from dagster import Bool, Field, Materialization, PythonObjectDagsterType, String, check
from dagster.config.field_utils import Selector
from dagster.core.storage.system_storage import fs_system_storage
from dagster.core.storage.type_storage import TypeStoragePlugin
from dagster.core.types.config_schema import output_selector_schema


@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': Field(String),
                'sep': Field(String, is_required=False),
                'header': Field(Bool, is_required=False),
            },
        }
    )
)
def spark_df_output_schema(_context, file_type, file_options, spark_df):
    if file_type == 'csv':
        spark_df.write.csv(
            file_options['path'], header=file_options.get('header'), sep=file_options.get('sep')
        )
        return Materialization.file(file_options['path'])
    else:
        check.failed('Unsupported file type: {}'.format(file_type))
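
For orientation (an assumed selection, not from the source): the output_selector_schema decorator passes this function the name of the chosen branch plus its options, so a run-config choice of the 'csv' branch corresponds roughly to the following arguments.

# Hypothetical file_options for the 'csv' branch above (the path is a placeholder);
# the decorator would pass these to spark_df_output_schema with file_type='csv'.
file_options = {'path': '/tmp/out.csv', 'header': True, 'sep': ','}
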

Example #8
from dagster import Bool, Field, Materialization, Path, String, as_dagster_type, check, resource
from dagster.config.field_utils import Selector
from dagster.core.storage.system_storage import fs_system_storage
from dagster.core.storage.type_storage import TypeStoragePlugin
from dagster.core.types.config_schema import input_selector_schema, output_selector_schema

from .decorators import pyspark_solid
from .resources import PySparkResourceDefinition, pyspark_resource, spark_session_from_config


@input_selector_schema(
    Selector(
        {
            'csv': {
                'path': Field(Path),
                'sep': Field(String, is_optional=True),
                'header': Field(Bool, is_optional=True),
            }
        }
    )
)
def load_rdd(context, file_type, file_options):
    if file_type == 'csv':
        return context.resources.spark.spark_session.read.csv(
            file_options['path'], sep=file_options.get('sep')
        ).rdd
    else:
        check.failed('Unsupported file type: {}'.format(file_type))


@output_selector_schema(
Example #9
def test_post_process_config():
    scalar_config_type = resolve_to_config_type(String)
    assert post_process_config(scalar_config_type, "foo").value == "foo"
    assert post_process_config(scalar_config_type, 3).value == 3
    assert post_process_config(scalar_config_type, {}).value == {}
    assert post_process_config(scalar_config_type, None).value is None

    enum_config_type = resolve_to_config_type(
        Enum("an_enum", [EnumValue("foo"),
                         EnumValue("bar", python_value=3)]))
    assert post_process_config(enum_config_type, "foo").value == "foo"
    assert post_process_config(enum_config_type, "bar").value == 3
    with pytest.raises(CheckError):
        post_process_config(enum_config_type, "baz")
    with pytest.raises(CheckError):
        post_process_config(enum_config_type, None)

    list_config_type = resolve_to_config_type([str])
    assert post_process_config(list_config_type, ["foo"]).value == ["foo"]
    assert post_process_config(list_config_type, None).value == []
    with pytest.raises(CheckError, match="Null array member not caught"):
        assert post_process_config(list_config_type, [None]).value == [None]

    nullable_list_config_type = resolve_to_config_type([Noneable(str)])
    assert post_process_config(nullable_list_config_type,
                               ["foo"]).value == ["foo"]
    assert post_process_config(nullable_list_config_type,
                               [None]).value == [None]
    assert post_process_config(nullable_list_config_type, None).value == []

    map_config_type = resolve_to_config_type({str: int})
    assert post_process_config(map_config_type, {"foo": 5}).value == {"foo": 5}
    assert post_process_config(map_config_type, None).value == {}
    with pytest.raises(CheckError, match="Null map member not caught"):
        assert post_process_config(map_config_type, {
            "foo": None
        }).value == {
            "foo": None
        }

    nullable_map_config_type = resolve_to_config_type({str: Noneable(int)})
    assert post_process_config(nullable_map_config_type, {
        "foo": 5
    }).value == {
        "foo": 5
    }
    assert post_process_config(nullable_map_config_type, {
        "foo": None
    }).value == {
        "foo": None
    }
    assert post_process_config(nullable_map_config_type, None).value == {}

    composite_config_type = resolve_to_config_type({
        "foo":
        String,
        "bar": {
            "baz": [str]
        },
        "quux":
        Field(str, is_required=False, default_value="zip"),
        "quiggle":
        Field(str, is_required=False),
        "werty":
        Field({str: [int]}, is_required=False),
    })
    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(composite_config_type, {})

    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(composite_config_type, {
            "bar": {
                "baz": ["giraffe"]
            },
            "quux": "nimble"
        })

    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(composite_config_type, {
            "foo": "zowie",
            "quux": "nimble"
        })

    assert post_process_config(composite_config_type, {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quux": "nimble"
    }).value == {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quux": "nimble"
    }

    assert post_process_config(composite_config_type, {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        }
    }).value == {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quux": "zip"
    }

    assert post_process_config(composite_config_type, {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quiggle": "squiggle"
    }).value == {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quux": "zip",
        "quiggle": "squiggle"
    }

    assert post_process_config(
        composite_config_type,
        {
            "foo": "zowie",
            "bar": {
                "baz": ["giraffe"]
            },
            "quiggle": "squiggle",
            "werty": {
                "asdf": [1, 2, 3]
            },
        },
    ).value == {
        "foo": "zowie",
        "bar": {
            "baz": ["giraffe"]
        },
        "quux": "zip",
        "quiggle": "squiggle",
        "werty": {
            "asdf": [1, 2, 3]
        },
    }

    nested_composite_config_type = resolve_to_config_type({
        "fruts": {
            "apple": Field(String),
            "banana": Field(String, is_required=False),
            "potato": Field(String, is_required=False, default_value="pie"),
        }
    })

    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(nested_composite_config_type, {"fruts": None})

    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(nested_composite_config_type,
                            {"fruts": {
                                "banana": "good",
                                "potato": "bad"
                            }})

    assert post_process_config(nested_composite_config_type, {
        "fruts": {
            "apple": "strawberry"
        }
    }).value == {
        "fruts": {
            "apple": "strawberry",
            "potato": "pie"
        }
    }

    assert post_process_config(nested_composite_config_type, {
        "fruts": {
            "apple": "a",
            "banana": "b",
            "potato": "c"
        }
    }).value == {
        "fruts": {
            "apple": "a",
            "banana": "b",
            "potato": "c"
        }
    }

    any_config_type = resolve_to_config_type(Any)

    assert post_process_config(any_config_type, {
        "foo": "bar"
    }).value == {
        "foo": "bar"
    }

    assert post_process_config(
        ConfigType("gargle", given_name="bargle", kind=ConfigTypeKind.ANY), 3)

    selector_config_type = resolve_to_config_type(
        Selector({
            "one":
            Field(String),
            "another": {
                "foo": Field(String, default_value="bar", is_required=False)
            },
            "yet_another":
            Field(String, default_value="quux", is_required=False),
        }))

    with pytest.raises(CheckError):
        post_process_config(selector_config_type, "one")

    with pytest.raises(ParameterCheckError):
        post_process_config(selector_config_type, None)

    with pytest.raises(ParameterCheckError,
                       match="Expected dict with single item"):
        post_process_config(selector_config_type, {})

    with pytest.raises(CheckError):
        post_process_config(selector_config_type, {
            "one": "foo",
            "another": "bar"
        })

    assert post_process_config(selector_config_type, {
        "one": "foo"
    }).value == {
        "one": "foo"
    }

    assert post_process_config(selector_config_type, {
        "one": None
    }).value == {
        "one": None
    }

    assert post_process_config(selector_config_type, {
        "one": {}
    }).value == {
        "one": {}
    }

    assert post_process_config(selector_config_type, {
        "another": {}
    }).value == {
        "another": {
            "foo": "bar"
        }
    }

    singleton_selector_config_type = resolve_to_config_type(
        Selector(
            {"foo": Field(String, default_value="bar", is_required=False)}))

    assert post_process_config(singleton_selector_config_type, None).value == {
        "foo": "bar"
    }

    permissive_dict_config_type = resolve_to_config_type(
        Permissive({
            "foo": Field(String),
            "bar": Field(String, default_value="baz", is_required=False)
        }))

    with pytest.raises(CheckError, match="Missing required composite member"):
        post_process_config(permissive_dict_config_type, None)

    assert post_process_config(permissive_dict_config_type, {
        "foo": "wow",
        "mau": "mau"
    }).value == {
        "foo": "wow",
        "bar": "baz",
        "mau": "mau",
    }

    noneable_permissive_config_type = resolve_to_config_type({
        "args":
        Field(Noneable(Permissive()), is_required=False, default_value=None)
    })
    assert post_process_config(noneable_permissive_config_type, {
        "args": {
            "foo": "wow",
            "mau": "mau"
        }
    }).value["args"] == {
        "foo": "wow",
        "mau": "mau",
    }
    assert post_process_config(noneable_permissive_config_type, {
        "args": {}
    }).value["args"] == {}
    assert post_process_config(noneable_permissive_config_type,
                               None).value["args"] == None
Example #10
 Selector(
     {
         'csv': Permissive(
             {
                 'path': Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'compression': Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file.",
                 ),
                 'sep': Field(
                     String,
                     is_required=False,
                     description="sets a single character as a separator for each field and value. If None is set, it uses the default value, ``,``.",
                 ),
                 'quote': Field(
                     String,
                     is_required=False,
                     description="""sets a single character used for escaping quoted values where the separator can be part of the value. If None is set, it uses the default value, ``"``. If an empty string is set, it uses ``u0000`` (null character).""",
                 ),
                 'escape': Field(
                     String,
                     is_required=False,
                     description="sets a single character used for escaping quotes inside an already quoted value. If None is set, it uses the default value, ``\\``.",
                 ),
                 'escapeQuotes': Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether values containing quotes should always be enclosed in quotes. If None is set, it uses the default value ``true``, escaping all values containing a quote character.",
                 ),
                 'quoteAll': Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether all values should always be enclosed in quotes. If None is set, it uses the default value ``false``, only escaping values containing a quote character.",
                 ),
                 'header': Field(
                     Bool,
                     is_required=False,
                     description="writes the names of columns as the first line. If None is set, it uses the default value, ``false``.",
                 ),
                 'nullValue': Field(
                     String,
                     is_required=False,
                     description="sets the string representation of a null value. If None is set, it uses the default value, empty string.",
                 ),
                 'dateFormat': Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a date format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to date type. If None is set, it uses the default value, ``yyyy-MM-dd``.",
                 ),
                 'timestampFormat': Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a timestamp format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to timestamp type. If None is set, it uses the default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.",
                 ),
                 'ignoreLeadingWhiteSpace': Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether or not leading whitespaces from values being written should be skipped. If None is set, it uses the default value, ``true``.",
                 ),
                 'ignoreTrailingWhiteSpace': Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether or not trailing whitespaces from values being written should be skipped. If None is set, it uses the default value, ``true``.",
                 ),
                 'charToEscapeQuoteEscaping': Field(
                     String,
                     is_required=False,
                     description="sets a single character used for escaping the escape for the quote character. If None is set, the default value is escape character when escape and quote characters are different, ``\0`` otherwise..",
                 ),
                 'encoding': Field(
                     String,
                     is_required=False,
                     description="sets the encoding (charset) of saved csv files. If None is set, the default UTF-8 charset will be used.",
                 ),
                 'emptyValue': Field(
                     String,
                     is_required=False,
                     description="sets the string representation of an empty value. If None is set, it uses the default value, ``"
                     "``.",
                 ),
             }
         ),
         'parquet': Permissive(
             {
                 'path': Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'partitionBy': Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 'compression': Field(
                     WriteCompressionParquetOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``spark.sql.parquet.compression.codec``. If None is set, it uses the value specified in ``spark.sql.parquet.compression.codec``.",
                 ),
             }
         ),
         'json': Permissive(
             {
                 'path': Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'compression': Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file.",
                 ),
                 'dateFormat': Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a date format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to date type. If None is set, it uses the default value, ``yyyy-MM-dd``.",
                 ),
                 'timestampFormat': Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a timestamp format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to timestamp type. If None is set, it uses the default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.",
                 ),
                 'encoding': Field(
                     String,
                     is_required=False,
                     description="sets the encoding (charset) of saved csv files. If None is set, the default UTF-8 charset will be used.",
                 ),
                 'lineSep': Field(
                     String,
                     is_required=False,
                     description="defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``.",
                 ),
             }
         ),
         'jdbc': Permissive(
             {
                 'url': Field(
                     String,
                     is_required=True,
                     description="a JDBC URL of the form ``jdbc:subprotocol:subname``.",
                 ),
                 'table': Field(
                     String,
                     is_required=True,
                     description="Name of the table in the external database.",
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'properties': Field(
                     Permissive(),
                     is_required=False,
                     description="""a dictionary of JDBC database connection arguments. Normally at least properties "user" and "password" with their corresponding values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }.""",
                 ),
             }
         ),
         'orc': Permissive(
             {
                 'path': Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'partitionBy': Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 'compression': Field(
                     WriteCompressionOrcOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``orc.compress`` and ``spark.sql.orc.compression.codec``. If None is set, it uses the value specified in ``spark.sql.orc.compression.codec``.",
                 ),
             }
         ),
         'saveAsTable': Permissive(
             {
                 'name': Field(String, is_required=True, description="the table name."),
                 'format': Field(
                     String, is_required=False, description="the format used to save."
                 ),
                 'mode': Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 'partitionBy': Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 'options': Field(
                     Permissive(), is_required=False, description="all other string options."
                 ),
             }
         ),
         'text': Permissive(
             {
                 'path': Field(
                     String,
                     is_required=True,
                     description="he path in any Hadoop supported file system.",
                 ),
                 'compression': Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``orc.compress`` and ``spark.sql.orc.compression.codec``. If None is set, it uses the value specified in ``spark.sql.orc.compression.codec``.",
                 ),
                 'lineSep': Field(
                     String,
                     is_required=False,
                     description="defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``.",
                 ),
             }
         ),
     }
 )
Example #11
CONSTRAINT_BLACKLIST = {ColumnExistsConstraint, ColumnTypeConstraint}


def dict_without_keys(ddict, *keys):
    return {key: value for key, value in ddict.items() if key not in set(keys)}


@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': Path,
                'sep': Field(String, is_optional=True, default_value=','),
            },
            'parquet': {
                'path': Path
            },
            'table': {
                'path': Path
            },
        }, ))
def dataframe_output_schema(_context, file_type, file_options, pandas_df):
    check.str_param(file_type, 'file_type')
    check.dict_param(file_options, 'file_options')
    check.inst_param(pandas_df, 'pandas_df', DataFrame)

    if file_type == 'csv':
        path = file_options['path']
        pandas_df.to_csv(path,
                         index=False,
Example #12
 Selector(
     {
         "csv":
         Permissive({
             "path":
             Field(
                 Any,
                 is_required=True,
                 description=
                 "str or list, Path glob indicating the naming scheme for the output files",
             ),
             "single_file":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         Whether to save everything into a single CSV file.
                         Under the single file mode, each partition is appended at the end of the specified CSV file.
                         Note that not all filesystems support the append mode and thus the single file mode,
                         especially on cloud storage systems such as S3 or GCS.
                         A warning will be issued when writing to a file that is not backed by a local filesystem.
                     """,
             ),
             "encoding":
             Field(
                 String,
                 is_required=False,
                 description="""
                         A string representing the encoding to use in the output file,
                         defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
                     """,
             ),
             "mode":
             Field(
                 String,
                 is_required=False,
                 description="Python write mode, default 'w'",
             ),
             "compression":
             Field(
                 WriteCompressionTextOptions,
                 is_required=False,
                 description="""
                         a string representing the compression to use in the output file,
                         allowed values are 'gzip', 'bz2', 'xz'.
                     """,
             ),
             "compute":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         If true, immediately executes.
                         If False, returns a set of delayed objects, which can be computed at a later time.
                     """,
             ),
             "storage_options":
             Field(
                 Permissive(),
                 is_required=False,
                 description=
                 "Parameters passed on to the backend filesystem class.",
             ),
             "header_first_partition_only":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         If set to `True`, only write the header row in the first output file.
                         By default, headers are written to all partitions
                         under the multiple file mode (`single_file` is `False`)
                         and written only once under the single file mode (`single_file` is `True`).
                         It must not be `False` under the single file mode.
                     """,
             ),
             "compute_kwargs":
             Field(
                 Permissive(),
                 is_required=False,
                 description="Options to be passed in to the compute method",
             ),
         }),
         "parquet":
         Permissive({
             "path":
             Field(
                 Any,
                 is_required=True,
                 description="""
                         str or pathlib.Path, Destination directory for data.
                         Prepend with protocol like ``s3://`` or ``hdfs://`` for remote data.
                     """,
             ),
             "engine":
             Field(
                 EngineParquetOptions,
                 is_required=False,
                 description="""
                         {'auto', 'fastparquet', 'pyarrow'}, default 'auto' Parquet library to use.
                         If only one library is installed, it will use that one; if both, it will use 'fastparquet'.
                     """,
             ),
             "compression":
             Field(
                 Any,
                 is_required=False,
                 description="""
                     str or dict, optional Either a string like ``'snappy'``
                     or a dictionary mapping column names to compressors like ``{'name': 'gzip', 'values': 'snappy'}``.
                     The default is ``'default'``, which uses the default compression for whichever engine is selected.
                     """,
             ),
             "write_index":
             Field(
                 Bool,
                 is_required=False,
                 description=
                 "Whether or not to write the index. Defaults to True.",
             ),
             "append":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         If False (default), construct data-set from scratch.
                         If True, add new row-group(s) to an existing data-set.
                         In the latter case, the data-set must exist, and the schema must match the input data.
                     """,
             ),
             "ignore_divisions":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         If False (default) raises error when previous divisions overlap with the new appended divisions.
                         Ignored if append=False.
                     """,
             ),
             "partition_on":
             Field(
                 list,
                 is_required=False,
                 description="""
                         Construct directory-based partitioning by splitting on these fields values.
                         Each dask partition will result in one or more datafiles, there will be no global groupby.
                     """,
             ),
             "storage_options":
             Field(
                 Permissive(),
                 is_required=False,
                 description=
                 "Key/value pairs to be passed on to the file-system backend, if any.",
             ),
             "write_metadata_file":
             Field(
                 Bool,
                 is_required=False,
                 description=
                 "Whether to write the special '_metadata' file.",
             ),
             "compute":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         If True (default) then the result is computed immediately.
                         If False then a ``dask.delayed`` object is returned for future computation.
                     """,
             ),
             "compute_kwargs":
             Field(
                 Permissive(),
                 is_required=False,
                 description=
                 "Options to be passed in to the compute method.",
             ),
         }),
         "hdf":
         Permissive({
             "path":
             Field(
                 Any,
                 is_required=True,
                 description="""
                         str or pathlib.Path, Path to a target filename.
                         Supports strings, ``pathlib.Path``, or any object implementing the ``__fspath__`` protocol.
                         May contain a ``*`` to denote many filenames.
                     """,
             ),
             "key":
             Field(
                 String,
                 is_required=True,
                 description="""
                         Datapath within the files.
                         May contain a ``*`` to denote many locations.
                     """,
             ),
             "compute":
             Field(
                 Bool,
                 is_required=False,
                 description="""
                         Whether or not to execute immediately.
                         If False then this returns a ``dask.Delayed`` value.
                     """,
             ),
             "scheduler":
             Field(
                 String,
                 is_required=False,
                 description=
                 "The scheduler to use, like 'threads' or 'processes'.",
             ),
         }),
         "json":
         Permissive(
             {
                 "path":
                 Field(
                     Any,
                     is_required=True,
                     description="""
                         str or list, Location to write to.
                         If a string, and there are more than one partitions in df,
                         should include a glob character to expand into a set of file names,
                         or provide a ``name_function=`` parameter.
                         Supports protocol specifications such as ``'s3://'``.
                     """,
                 ),
                 "encoding":
                 Field(
                     String,
                     is_required=False,
                     description=
                     "default is 'utf-8', The text encoding to implement, e.g., 'utf-8'.",
                 ),
                 "errors":
                 Field(
                     String,
                     is_required=False,
                     description=
                     "default is 'strict', how to respond to errors in the conversion (see ``str.encode()``).",
                 ),
                 "storage_options":
                 Field(
                     Permissive(),
                     is_required=False,
                     description=
                     "Passed to backend file-system implementation",
                 ),
                 "compute":
                 Field(
                     Bool,
                     is_required=False,
                     description="""
                         If true, immediately executes.
                         If False, returns a set of delayed objects, which can be computed at a later time.
                     """,
                 ),
                 "compute_kwargs":
                 Field(
                     Permissive(),
                     is_required=False,
                     description=
                     "Options to be passed in to the compute method",
                 ),
                 "compression":
                 Field(
                     String,
                     is_required=False,
                     description="String like 'gzip' or 'xz'.",
                 ),
             }, ),
         "sql":
         Permissive(
             {
                 "name":
                 Field(
                     String,
                     is_required=True,
                     description="Name of SQL table",
                 ),
                 "uri":
                 Field(
                     String,
                     is_required=True,
                     description=
                     "Full sqlalchemy URI for the database connection",
                 ),
                 "schema":
                 Field(
                     String,
                     is_required=False,
                     description=
                     "Specify the schema (if database flavor supports this). If None, use default schema.",
                 ),
                 "if_exists":
                 Field(
                     String,
                     is_required=False,
                     description="""
                         {'fail', 'replace', 'append'}, default 'fail'
                         How to behave if the table already exists.
                         * fail: Raise a ValueError.
                         * replace: Drop the table before inserting new values.
                         * append: Insert new values to the existing table.
                     """,
                 ),
                 "index":
                 Field(
                     Bool,
                     is_required=False,
                     description="""
                         default is True, Write DataFrame index as a column.
                         Uses `index_label` as the column name in the table.
                     """,
                 ),
                 "index_label":
                 Field(
                     Any,
                     is_required=False,
                     description="""
                         str or sequence, default None Column label for index column(s).
                         If None is given (default) and `index` is True, then the index names are used.
                         A sequence should be given if the DataFrame uses MultiIndex.
                     """,
                 ),
                 "chunksize":
                 Field(
                     Int,
                     is_required=False,
                     description="""
                         Specify the number of rows in each batch to be written at a time.
                         By default, all rows will be written at once.
                     """,
                 ),
                 "dtype":
                 Field(
                     Any,
                     is_required=False,
                     description="""
                         dict or scalar, Specifying the datatype for columns.
                         If a dictionary is used, the keys should be the column names
                         and the values should be the SQLAlchemy types or strings for the sqlite3 legacy mode.
                         If a scalar is provided, it will be applied to all columns.
                     """,
                 ),
                 "method":
                 Field(
                     String,
                     is_required=False,
                     description="""
                         {None, 'multi', callable}, default None
                         Controls the SQL insertion clause used:
                         * None : Uses standard SQL ``INSERT`` clause (one per row).
                         * 'multi': Pass multiple values in a single ``INSERT`` clause.
                         * callable with signature ``(pd_table, conn, keys, data_iter)``.
                         Details and a sample callable implementation can be found in the
                         section :ref:`insert method <io.sql.method>`.
                     """,
                 ),
                 "compute":
                 Field(
                     Bool,
                     is_required=False,
                     description="""
                         default is True, When true, call dask.compute and perform the load into SQL;
                         otherwise, return a Dask object (or array of per-block objects when parallel=True).
                     """,
                 ),
                 "parallel":
                 Field(
                     Bool,
                     is_required=False,
                     description="""
                         default is False, When true, have each block append itself to the DB table concurrently.
                         This can result in DB rows being in a different order than the source DataFrame's corresponding rows.
                         When false, load each block into the SQL DB in sequence.
                     """,
                 ),
             }, ),
     }, ))
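
No materializer body is shown for this Selector on this page. As a hedged sketch (the helper name is hypothetical and dask_df is assumed to be a dask.dataframe.DataFrame), consuming it would presumably mirror the pandas examples above: unpack the single selected branch and forward the remaining options to the matching dask writer.

def _write_dask_df(config, dask_df):
    # Unpack the single branch chosen from the Selector above.
    file_type, file_options = list(config.items())[0]
    options = {k: v for k, v in file_options.items() if k != "path"}
    if file_type == "csv":
        dask_df.to_csv(file_options["path"], **options)
    elif file_type == "parquet":
        dask_df.to_parquet(file_options["path"], **options)
    else:
        raise ValueError("Unsupported file type: {}".format(file_type))
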
Example #13
def _get_default_executor_requirements(executor_config):
    return multiple_process_executor_requirements(
    ) if "multiprocess" in executor_config else []


@executor(
    name="multi_or_in_process_executor",
    config_schema=Field(
        Selector(
            {
                "multiprocess": {
                    "max_concurrent":
                    Field(Int, is_required=False, default_value=0),
                    "retries":
                    get_retries_config(),
                },
                "in_process": {
                    "retries": get_retries_config(),
                    "marker_to_close": Field(str, is_required=False),
                },
            }, ),
        default_value={"multiprocess": {}},
    ),
    requirements=_get_default_executor_requirements,
)
def multi_or_in_process_executor(init_context):
    """The default executor for a job.

    This is the executor available by default on a :py:class:`JobDefinition`
    that does not provide custom executors. This executor has a multiprocessing-enabled mode, and a
Example #14
CONSTRAINT_BLACKLIST = {ColumnTypeConstraint}


def dict_without_keys(ddict, *keys):
    return {key: value for key, value in ddict.items() if key not in set(keys)}


@output_selector_schema(
    Selector(
        {
            'csv': {
                'path': Path,
                'sep': Field(String, is_required=False, default_value=','),
            },
            'parquet': {
                'path': Path
            },
            'table': {
                'path': Path
            },
        }, ))
def dataframe_output_schema(_context, file_type, file_options, pandas_df):
    check.str_param(file_type, 'file_type')
    check.dict_param(file_options, 'file_options')
    check.inst_param(pandas_df, 'pandas_df', pd.DataFrame)

    if file_type == 'csv':
        path = file_options['path']
        pandas_df.to_csv(path,
                         index=False,
Example #15
from dagster.config.field_utils import Selector
from dagster.utils.backcompat import canonicalize_backcompat_args

CONSTRAINT_BLACKLIST = {ColumnDTypeFnConstraint, ColumnDTypeInSetConstraint}


def dict_without_keys(ddict, *keys):
    return {key: value for key, value in ddict.items() if key not in set(keys)}


@dagster_type_materializer(
    Selector(
        {
            'csv': {
                'path': StringSource,
                'sep': Field(StringSource, is_required=False, default_value=','),
            },
            'parquet': {'path': StringSource},
            'table': {'path': StringSource},
        },
    )
)
def dataframe_materializer(_context, config, pandas_df):
    check.inst_param(pandas_df, 'pandas_df', pd.DataFrame)
    file_type, file_options = list(config.items())[0]

    if file_type == 'csv':
        path = file_options['path']
        pandas_df.to_csv(path, index=False, **dict_without_keys(file_options, 'path'))
    elif file_type == 'parquet':
        pandas_df.to_parquet(file_options['path'])
    elif file_type == 'table':
Example #16
 Selector(
     {
         "csv": Permissive(
             {
                 "path": Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "compression": Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file.",
                 ),
                 "sep": Field(
                     String,
                     is_required=False,
                     description="sets a single character as a separator for each field and value. If None is set, it uses the default value, ``,``.",
                 ),
                 "quote": Field(
                     String,
                     is_required=False,
                     description="""sets a single character used for escaping quoted values where the separator can be part of the value. If None is set, it uses the default value, ``"``. If an empty string is set, it uses ``u0000`` (null character).""",
                 ),
                 "escape": Field(
                     String,
                     is_required=False,
                     description="sets a single character used for escaping quotes inside an already quoted value. If None is set, it uses the default value, ``\\``.",
                 ),
                 "escapeQuotes": Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether values containing quotes should always be enclosed in quotes. If None is set, it uses the default value ``true``, escaping all values containing a quote character.",
                 ),
                 "quoteAll": Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether all values should always be enclosed in quotes. If None is set, it uses the default value ``false``, only escaping values containing a quote character.",
                 ),
                 "header": Field(
                     Bool,
                     is_required=False,
                     description="writes the names of columns as the first line. If None is set, it uses the default value, ``false``.",
                 ),
                 "nullValue": Field(
                     String,
                     is_required=False,
                     description="sets the string representation of a null value. If None is set, it uses the default value, empty string.",
                 ),
                 "dateFormat": Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a date format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to date type. If None is set, it uses the default value, ``yyyy-MM-dd``.",
                 ),
                 "timestampFormat": Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a timestamp format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to timestamp type. If None is set, it uses the default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.",
                 ),
                 "ignoreLeadingWhiteSpace": Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether or not leading whitespaces from values being written should be skipped. If None is set, it uses the default value, ``true``.",
                 ),
                 "ignoreTrailingWhiteSpace": Field(
                     Bool,
                     is_required=False,
                     description="a flag indicating whether or not trailing whitespaces from values being written should be skipped. If None is set, it uses the default value, ``true``.",
                 ),
                 "charToEscapeQuoteEscaping": Field(
                     String,
                     is_required=False,
                     description="sets a single character used for escaping the escape for the quote character. If None is set, the default value is escape character when escape and quote characters are different, ``\0`` otherwise..",
                 ),
                 "encoding": Field(
                     String,
                     is_required=False,
                     description="sets the encoding (charset) of saved csv files. If None is set, the default UTF-8 charset will be used.",
                 ),
                 "emptyValue": Field(
                     String,
                     is_required=False,
                     description="sets the string representation of an empty value. If None is set, it uses the default value, ``"
                     "``.",
                 ),
             }
         ),
         "parquet": Permissive(
             {
                 "path": Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "partitionBy": Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 "compression": Field(
                     WriteCompressionParquetOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``spark.sql.parquet.compression.codec``. If None is set, it uses the value specified in ``spark.sql.parquet.compression.codec``.",
                 ),
             }
         ),
         "json": Permissive(
             {
                 "path": Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "compression": Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file.",
                 ),
                 "dateFormat": Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a date format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to date type. If None is set, it uses the default value, ``yyyy-MM-dd``.",
                 ),
                 "timestampFormat": Field(
                     String,
                     is_required=False,
                     description="sets the string that indicates a timestamp format. Custom date formats follow the formats at ``java.text.SimpleDateFormat``. This applies to timestamp type. If None is set, it uses the default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``.",
                 ),
                 "encoding": Field(
                     String,
                     is_required=False,
                     description="sets the encoding (charset) of saved csv files. If None is set, the default UTF-8 charset will be used.",
                 ),
                 "lineSep": Field(
                     String,
                     is_required=False,
                     description="defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``.",
                 ),
             }
         ),
         "jdbc": Permissive(
             {
                 "url": Field(
                     String,
                     is_required=True,
                     description="a JDBC URL of the form ``jdbc:subprotocol:subname``.",
                 ),
                 "table": Field(
                     String,
                     is_required=True,
                     description="Name of the table in the external database.",
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "properties": Field(
                     Permissive(),
                     is_required=False,
                     description="""a dictionary of JDBC database connection arguments. Normally at least properties "user" and "password" with their corresponding values. For example { 'user' : 'SYSTEM', 'password' : 'mypassword' }.""",
                 ),
             }
         ),
         "orc": Permissive(
             {
                 "path": Field(
                     String,
                     is_required=True,
                     description="the path in any Hadoop supported file system.",
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "partitionBy": Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 "compression": Field(
                     WriteCompressionOrcOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``orc.compress`` and ``spark.sql.orc.compression.codec``. If None is set, it uses the value specified in ``spark.sql.orc.compression.codec``.",
                 ),
             }
         ),
         "saveAsTable": Permissive(
             {
                 "name": Field(String, is_required=True, description="the table name."),
                 "format": Field(
                     String, is_required=False, description="the format used to save."
                 ),
                 "mode": Field(
                     WriteModeOptions,
                     is_required=False,
                     description="specifies the behavior of the save operation when data already exists.",
                 ),
                 "partitionBy": Field(
                     String, is_required=False, description="names of partitioning columns."
                 ),
                 "options": Field(
                     Permissive(), is_required=False, description="all other string options."
                 ),
             }
         ),
         "text": Permissive(
             {
                 "path": Field(
                     String,
                     is_required=True,
                     description="he path in any Hadoop supported file system.",
                 ),
                 "compression": Field(
                     WriteCompressionTextOptions,
                     is_required=False,
                     description="compression codec to use when saving to file. This will override ``orc.compress`` and ``spark.sql.orc.compression.codec``. If None is set, it uses the value specified in ``spark.sql.orc.compression.codec``.",
                 ),
                 "lineSep": Field(
                     String,
                     is_required=False,
                     description="defines the line separator that should be used for writing. If None is set, it uses the default value, ``\\n``.",
                 ),
             }
         ),
         "other": Permissive(),
     }
 )
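Because the Selector keys above line up with pyspark.sql.DataFrameWriter method names, a validated config value can be applied with a simple getattr dispatch. A rough sketch under that assumption; apply_write_config is an illustrative helper, not part of dagster-pyspark, and the "saveAsTable" options field and the catch-all "other" branch would need extra handling:

def apply_write_config(spark_df, write_config):
    # Selector validation guarantees exactly one key in write_config
    write_format, options = list(write_config.items())[0]
    # e.g. {"parquet": {"path": "/tmp/out.parquet", "mode": "overwrite"}}
    # becomes spark_df.write.parquet(path="/tmp/out.parquet", mode="overwrite")
    getattr(spark_df.write, write_format)(**options)


# apply_write_config(df, {"csv": {"path": "/tmp/out", "header": True, "mode": "overwrite"}})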
Exemplo n.º 17
0
 def __init__(self):
     super(StringSourceType, self).__init__(
         scalar_type=ConfigStringInstance,
         non_scalar_type=Selector({'env': str}),
         _key='StringSourceType',
     )
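For context, a StringSource-style value is either a plain string scalar or an {'env': ...} selector that gets resolved from the environment during config post-processing. A rough illustrative resolver that mirrors that behaviour (not dagster's own post-processing code; DATABASE_URL is only a sample variable name):

import os


def resolve_string_source(value):
    # accept either a literal string or {'env': '<ENV_VAR_NAME>'}
    if isinstance(value, dict):
        return os.environ[value['env']]
    return value


# resolve_string_source('postgres://localhost')   -> 'postgres://localhost'
# resolve_string_source({'env': 'DATABASE_URL'})  -> value of $DATABASE_URL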