def test_subset_of_columns():
    @outputs(a=Types.Schema([('a', Types.Integer), ('b', Types.String)]))
    @python_task()
    def source(wf_params, a):
        out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
        with out as writer:
            writer.write(
                pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4, 5],
                    'b': ['a', 'b', 'c', 'd', 'e']
                }))
        a.set(out)

    @inputs(a=Types.Schema([('a', Types.Integer)]))
    @python_task()
    def sink(wf_params, a):
        with a as reader:
            df = reader.read(concat=True)
            assert len(df.columns.values) == 1
            assert df['a'].tolist() == [1, 2, 3, 4, 5]

        with a as reader:
            df = reader.read(truncate_extra_columns=False)
            assert df.columns.values.tolist() == ['a', 'b']
            assert df['a'].tolist() == [1, 2, 3, 4, 5]
            assert df['b'].tolist() == ['a', 'b', 'c', 'd', 'e']

    o = source.unit_test()
    sink.unit_test(**o)
def generate_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Types.Schema().create_from_hive_query(
        select_query=q1)
    schema_2, formatted_query_2 = Types.Schema().create_from_hive_query(
        select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema()()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test generic copy and pass through
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test typed copy and pass through
    a = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None
def copy_task(wf_params, a, b):
    out = Types.Schema()()
    with a as r:
        with out as w:
            for df in r.iter_chunks():
                w.write(df)
    b.set(out)
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")

    mpblob = Types.MultiPartBlob()
    with mpblob.create_part('000000') as w:
        w.write("hello I'm a mp blob".encode('utf-8'))
    with mpblob.create_part('000001') as w:
        w.write("hello I'm a mp blob too".encode('utf-8'))

    schema = Types.Schema([('a', Types.Integer), ('b', Types.Integer)])()
    with schema as w:
        w.write(_pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4, 5, 6]}))
        w.write(_pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6, 5, 4]}))

    a.set(blob)
    b.set(csv)
    c.set(mpcsv)
    d.set(mpblob)
    e.set(schema)
def test_no_output_set():
    @outputs(a=Types.Schema())
    @python_task()
    def null_set(wf_params, a):
        pass

    assert null_set.unit_test()['a'] is None
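# Hypothetical companion sketch (not part of the original tests): the inverse of
# test_no_output_set, assuming the same unit_test()/reader API used above -- when
# the task does set its schema output, the returned value is readable rather than
# None. The task and test names here are illustrative only.
def test_output_set_sketch():
    @outputs(a=Types.Schema([('a', Types.Integer)]))
    @python_task()
    def set_one_column(wf_params, a):
        out = Types.Schema([('a', Types.Integer)])()
        with out as w:
            w.write(pd.DataFrame.from_dict({'a': [1, 2, 3]}))
        a.set(out)

    result = set_one_column.unit_test()['a']
    assert result is not None
    with result as r:
        assert list(r.read()['a']) == [1, 2, 3]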
def copy_task(wf_params, a, b):
    out = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as r:
        with out as w:
            for df in r.iter_chunks():
                w.write(df)
    b.set(out)
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for train_data.",
    )
    validation_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(
        x_train=train_data,
        y_train=train_target,
        x_test=validation_data,
        y_test=validation_target,
    )

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )
    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
def source(wf_params, a):
    out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
    with out as writer:
        writer.write(
            pd.DataFrame.from_dict({
                'a': [1, 2, 3, 4, 5],
                'b': ['a', 'b', 'c', 'd', 'e']
            }))
    a.set(out)
def source(wf_params, a):
    out = Types.Schema([("a", Types.Integer), ("b", Types.String)])()
    with out as writer:
        writer.write(
            pd.DataFrame.from_dict({
                "a": [1, 2, 3, 4, 5],
                "b": ["a", "b", "c", "d", "e"]
            }))
    a.set(out)
def test_create_from_hive_query():
    s, q = Types.Schema().create_from_hive_query(
        "SELECT * FROM table", known_location="s3://somewhere/")

    assert s.mode == "wb"
    assert s.local_path is None
    assert s.remote_location == "s3://somewhere/"
    assert "SELECT * FROM table" in q
    assert s.remote_location in q
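# A minimal usage sketch (assumed, not from the original file): the
# (schema, formatted_query) pair asserted on above is meant to be consumed the
# same way generate_queries / two_queries do elsewhere in these tests -- the
# schema goes into the task's output and the formatted query is returned to
# the hive executor. The function name below is illustrative only.
def write_to_known_location_sketch(wf_params, hive_results):
    schema, query = Types.Schema().create_from_hive_query(
        select_query="SELECT * FROM table",
        known_location="s3://somewhere/")
    hive_results.set([schema])
    return [query]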
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test typed copy and pass through
    a = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs["b"] as r:
        df = r.read()
        assert list(df["a"]) == [1, 2, 3]
        assert list(df["b"]) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df["a"]) == [3, 2, 1]
        assert list(df["b"]) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test untyped failure
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    with pytest.raises(_user_exceptions.FlyteTypeException):
        copy_task.unit_test(a=a)
def test_bad_column_types():
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Blob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartBlob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartCSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.CSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Schema())])
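# For contrast, a sketch (not part of the original test file): the primitive
# column types used throughout these tests -- Integer, Float, and String --
# construct without raising.
def test_good_column_types_sketch():
    Types.Schema([("a", Types.Integer)])
    Types.Schema([("a", Types.Float)])
    Types.Schema([("a", Types.String)])
    Types.Schema([("a", Types.Integer), ("b", Types.Float), ("c", Types.String)])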
def add_one(wf_params, a, b):
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version='1')
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(a=Types.Blob,
         b=Types.CSV,
         c=Types.MultiPartCSV,
         d=Types.MultiPartBlob,
         e=Types.Schema([('a', Types.Integer), ('b', Types.Integer)]))
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")
# 8. Age (years)
# 9. Class variable (0 or 1)
# Example Row: 6,148,72,35,0,33.6,0.627,50,1
TYPED_COLUMNS = [
    ('#preg', Types.Integer),
    ('pgc_2h', Types.Integer),
    ('diastolic_bp', Types.Integer),
    ('tricep_skin_fold_mm', Types.Integer),
    ('serum_insulin_2h', Types.Integer),
    ('bmi', Types.Float),
    ('diabetes_pedigree', Types.Float),
    ('age', Types.Integer),
    ('class', Types.Integer),
]
# the input dataset schema
DATASET_SCHEMA = Types.Schema(TYPED_COLUMNS)
# the first 8 columns are features
FEATURES_SCHEMA = Types.Schema(TYPED_COLUMNS[:8])
# the last column is the class
CLASSES_SCHEMA = Types.Schema([TYPED_COLUMNS[-1]])


class XGBoostModelHyperparams(object):
    """
    These are the XGBoost hyperparameters available in the scikit-learn library.
    """

    def __init__(self,
                 max_depth=3,
                 learning_rate=0.1,
                 n_estimators=100,
                 objective="binary:logistic",
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version="1")
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(
    a=Types.Blob,
    b=Types.CSV,
    c=Types.MultiPartCSV,
    d=Types.MultiPartBlob,
    e=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]),
)
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode("utf-8"))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part("000000") as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part("000001") as w:
cache_version="1", cluster_label=_six.text_type("cluster_label"), tags=[_six.text_type("tag1")], ) def sample_qubole_hive_task(wf_params, in1): return _six.text_type("select ") + _six.text_type(in1) def test_hive_task(): assert isinstance(sample_hive_task, _sdk_runnable.SdkRunnableTask) assert isinstance(sample_hive_task, _hive_task.SdkHiveTask) sample_hive_task.unit_test(in1=5) @outputs(hive_results=[Types.Schema()]) @qubole_hive_task def two_queries(wf_params, hive_results): q1 = "SELECT 1" q2 = "SELECT 'two'" schema_1, formatted_query_1 = Schema.create_from_hive_query( select_query=q1) schema_2, formatted_query_2 = Schema.create_from_hive_query( select_query=q2) hive_results.set([schema_1, schema_2]) return [formatted_query_1, formatted_query_2] def test_interface_setup(): outs = two_queries.interface.outputs
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @python_task
    def fake_task(wf_params, a, b):
        pass
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def fake_task(wf_params, a, b):
        pass
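# A hedged follow-up sketch (not in the original tests): the two fake_task
# definitions above only exercise the decorators, so a natural extra check is to
# inspect the declared interface, assuming tasks expose it via `.interface` the
# way test_interface_setup reads two_queries.interface.outputs elsewhere in these
# tests. The test name is illustrative only.
def test_generic_schema_interface_sketch():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def fake_task(wf_params, a, b):
        pass

    assert set(fake_task.interface.inputs.keys()) == {"a"}
    assert set(fake_task.interface.outputs.keys()) == {"b"}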
def test_bad_definition():
    with pytest.raises(_user_exceptions.FlyteValueException):
        Types.Schema([])
"../../../common/configs/local.config", ), internal_overrides={ "image": "myflyteimage:v123", "project": "myflyteproject", "domain": "development" }, ): s = t.serialize() assert isinstance(s, _admin_task_pb2.TaskSpec) assert s.template.id.name == "tests.flytekit.unit.common_tests.tasks.test_task.my_task" assert s.template.container.image == "myflyteimage:v123" schema = Types.Schema([("a", Types.String), ("b", Types.Integer)]) def test_task_produce_deterministic_version(): containerless_task = SdkPrestoTask( task_inputs=inputs(ds=Types.String, rg=Types.String), statement= "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10", output_schema=schema, routing_group="{{ .Inputs.rg }}", ) identical_containerless_task = SdkPrestoTask( task_inputs=inputs(ds=Types.String, rg=Types.String), statement= "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10", output_schema=schema,
def read_and_merge(first, second):
    """
    Sagemaker likes the target to be in column 1. This method takes the y and
    the x and simply places the dataframes next to each other, yielding a
    single combined dataframe.
    """
    with first as r:
        first_df = r.read()
    with second as r:
        second_df = r.read()
    if len(first_df) != len(second_df):
        raise Exception(
            "trying to merge two data frames which are not equal in length")
    return pd.concat([first_df, second_df], axis=1)


@inputs(x_train=Types.Schema(),
        x_test=Types.Schema(),
        y_train=Types.Schema(),
        y_test=Types.Schema())
@outputs(train=Types.MultiPartCSV, validation=Types.MultiPartCSV)
@python_task(cache_version='3.0', cache=True, memory_limit="500Mi")
def convert_to_sagemaker_csv(ctx, x_train, y_train, x_test, y_test, train, validation):
    _train = read_and_merge(y_train, x_train)
    _validate = read_and_merge(y_test, x_test)

    with utils.AutoDeletingTempDir("train") as t:
        f = t.get_named_tempfile("train.csv")
        _train.to_csv(f, header=False, index=False)
        train.set(t.name)