class MultiRegionHousePricePredictionModelTrainer(object):
    """
    This pipeline generates synthetic data, trains an XGBoost model per region, and runs
    predictions against the test dataset.
    """

    regions = Input(Types.List(Types.String),
                    default=["SFO", "SEA", "DEN"],
                    help="Regions for which to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # The actual algorithm
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    fit_task = parallel_fit(multi_train=split.outputs.train)
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: joblib-serialized models per region and the accuracy of the model per region.
    # Note: these should ideally be maps, but for the demo we output simple lists.
    models = Output(fit_task.outputs.multi_models, sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies, sdk_type=Types.List(Types.Float))
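# A minimal sketch (an assumption, not part of the original pipeline) of exercising the
# pipeline's first task locally with the unit_test helper used elsewhere in this repo.
outs = generate_and_split_data_multiloc.unit_test(
    locations=["SFO", "SEA", "DEN"], number_of_houses_per_location=10, seed=7)
assert len(outs["train"]) == 3  # one training MultiPartCSV per region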
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")

    mpblob = Types.MultiPartBlob()
    with mpblob.create_part('000000') as w:
        w.write("hello I'm a mp blob".encode('utf-8'))
    with mpblob.create_part('000001') as w:
        w.write("hello I'm a mp blob too".encode('utf-8'))

    schema = Types.Schema([('a', Types.Integer), ('b', Types.Integer)])()
    with schema as w:
        w.write(_pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4, 5, 6]}))
        w.write(_pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6, 5, 4]}))

    a.set(blob)
    b.set(csv)
    c.set(mpcsv)
    d.set(mpblob)
    e.set(schema)
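# A minimal sketch (not from the original source) of exercising write_special_types through
# unit_test and reading one output back; it assumes the @outputs/@python_task decorators
# shown elsewhere in this repo are applied to the task.
out = write_special_types.unit_test()
with out['a'] as r:
    assert r.read().decode('utf-8') == "hello I'm a blob"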
def test_subset_of_columns():
    @outputs(a=Types.Schema([('a', Types.Integer), ('b', Types.String)]))
    @python_task()
    def source(wf_params, a):
        out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
        with out as writer:
            writer.write(
                pd.DataFrame.from_dict({
                    'a': [1, 2, 3, 4, 5],
                    'b': ['a', 'b', 'c', 'd', 'e']
                }))
        a.set(out)

    @inputs(a=Types.Schema([('a', Types.Integer)]))
    @python_task()
    def sink(wf_params, a):
        with a as reader:
            df = reader.read(concat=True)
            assert len(df.columns.values) == 1
            assert df['a'].tolist() == [1, 2, 3, 4, 5]

        with a as reader:
            df = reader.read(truncate_extra_columns=False)
            assert df.columns.values.tolist() == ['a', 'b']
            assert df['a'].tolist() == [1, 2, 3, 4, 5]
            assert df['b'].tolist() == ['a', 'b', 'c', 'd', 'e']

    o = source.unit_test()
    sink.unit_test(**o)
def generate_queries(wf_params, hive_results):
    q1 = "SELECT 1"
    q2 = "SELECT 'two'"
    schema_1, formatted_query_1 = Types.Schema().create_from_hive_query(select_query=q1)
    schema_2, formatted_query_2 = Types.Schema().create_from_hive_query(select_query=q2)

    hive_results.set([schema_1, schema_2])
    return [formatted_query_1, formatted_query_2]
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema()()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test generic copy and pass through
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test typed copy and pass through
    a = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({'a': [1, 2, 3], 'b': [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({'a': [3, 2, 1], 'b': [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs['b'] as r:
        df = r.read()
        assert list(df['a']) == [1, 2, 3]
        assert list(df['b']) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df['a']) == [3, 2, 1]
        assert list(df['b']) == [6.0, 5.0, 4.0]

        assert r.read() is None
def test_multipartblob_passing():
    @inputs(a=Types.MultiPartBlob)
    @outputs(b=Types.MultiPartBlob)
    @python_task
    def test_pass(wf_params, a, b):
        b.set(a)

    b = Types.MultiPartBlob()
    with b.create_part("0") as w:
        w.write("Hello world".encode("utf-8"))
    with b.create_part("1") as w:
        w.write("Hello world2".encode("utf-8"))

    out = test_pass.unit_test(a=b)
    assert len(out) == 1
    with out["b"] as r:
        assert len(r) == 2
        assert r[0].read().decode("utf-8") == "Hello world"
        assert r[1].read().decode("utf-8") == "Hello world2"

    out = test_pass.unit_test(a=out["b"])
    assert len(out) == 1
    with out["b"] as r:
        assert len(r) == 2
        assert r[0].read().decode("utf-8") == "Hello world"
        assert r[1].read().decode("utf-8") == "Hello world2"
def test_no_output_set():
    @outputs(a=Types.Schema())
    @python_task()
    def null_set(wf_params, a):
        pass

    assert null_set.unit_test()['a'] is None
def copy_task(wf_params, a, b):
    out = Types.Schema()()
    with a as r:
        with out as w:
            for df in r.iter_chunks():
                w.write(df)
    b.set(out)
def copy_task(wf_params, a, b):
    out = Types.Schema([('a', Types.Integer), ('b', Types.Float)])()
    with a as r:
        with out as w:
            for df in r.iter_chunks():
                w.write(df)
    b.set(out)
def test_write(wf_params, a):
    b = Types.MultiPartCSV()
    with b.create_part("0") as w:
        w.write("Hello,world,1")
    with b.create_part("1") as w:
        w.write("Hello,world,2")
    a.set(b)
def test_write(wf_params, a):
    b = Types.MultiPartBlob()
    with b.create_part("0") as w:
        w.write("Hello world".encode("utf-8"))
    with b.create_part("1") as w:
        w.write("Hello world2".encode("utf-8"))
    a.set(b)
class StructuredSagemakerXGBoostHPO(object):
    # Input parameters
    static_hyperparameters = Input(
        Types.Generic,
        help="A list of the static hyperparameters to pass to the training jobs.",
        default=example_hyperparams,
    )
    train_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for training.",
    )
    train_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for train_data.",
    )
    validation_data = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the features used for validation.",
    )
    validation_target = Input(
        Types.Schema(),
        help="A Columnar schema that contains all the labeled results for validation_data.",
    )

    sagemaker_transform = convert_to_sagemaker_csv(x_train=train_data,
                                                   y_train=train_target,
                                                   x_test=validation_data,
                                                   y_test=validation_target)

    # Node definitions
    train_node = xgtrainer_task(
        static_hyperparameters=static_hyperparameters,
        train=sagemaker_transform.outputs.train,
        validation=sagemaker_transform.outputs.validation,
    )
    untar = untar_xgboost(model_tar=train_node.outputs.model)

    # Outputs
    model = Output(untar.outputs.model, sdk_type=Types.Blob)
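# A hypothetical illustration (not the project's actual definition) of what the
# example_hyperparams default referenced above might look like for an XGBoost training job;
# SageMaker built-in algorithms generally take hyperparameter values as strings.
example_hyperparams = {
    "objective": "binary:logistic",
    "eta": "0.2",
    "max_depth": "5",
    "num_round": "100",
}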
def test_create_from_hive_query():
    s, q = Types.Schema().create_from_hive_query(
        "SELECT * FROM table", known_location="s3://somewhere/")

    assert s.mode == "wb"
    assert s.local_path is None
    assert s.remote_location == "s3://somewhere/"
    assert "SELECT * FROM table" in q
    assert s.remote_location in q
def source(wf_params, a):
    out = Types.Schema([("a", Types.Integer), ("b", Types.String)])()
    with out as writer:
        writer.write(
            pd.DataFrame.from_dict({
                "a": [1, 2, 3, 4, 5],
                "b": ["a", "b", "c", "d", "e"]
            }))
    a.set(out)
def source(wf_params, a):
    out = Types.Schema([('a', Types.Integer), ('b', Types.String)])()
    with out as writer:
        writer.write(
            pd.DataFrame.from_dict({
                'a': [1, 2, 3, 4, 5],
                'b': ['a', 'b', 'c', 'd', 'e']
            }))
    a.set(out)
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Float)]))
    @python_task
    def copy_task(wf_params, a, b):
        out = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
        with a as r:
            with out as w:
                for df in r.iter_chunks():
                    w.write(df)
        b.set(out)

    # Test typed copy and pass through
    a = Types.Schema([("a", Types.Integer), ("b", Types.Float)])()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    outs = copy_task.unit_test(a=a)

    with outs["b"] as r:
        df = r.read()
        assert list(df["a"]) == [1, 2, 3]
        assert list(df["b"]) == [4.0, 5.0, 6.0]

        df = r.read()
        assert list(df["a"]) == [3, 2, 1]
        assert list(df["b"]) == [6.0, 5.0, 4.0]

        assert r.read() is None

    # Test untyped failure
    a = Types.Schema()()
    with a as w:
        w.write(pd.DataFrame.from_dict({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}))
        w.write(pd.DataFrame.from_dict({"a": [3, 2, 1], "b": [6.0, 5.0, 4.0]}))

    with pytest.raises(_user_exceptions.FlyteTypeException):
        copy_task.unit_test(a=a)
def test_bad_column_types():
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Blob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartBlob)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.MultiPartCSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.CSV)])
    with pytest.raises(_user_exceptions.FlyteTypeException):
        Types.Schema([("a", Types.Schema())])
def collect_blobs(folder_path):
    onlyfiles = [
        join(folder_path, f) for f in sorted(listdir(folder_path))
        if isfile(join(folder_path, f))
    ]
    my_blobs = []
    file_names = []
    for local_filepath in onlyfiles:
        my_blob = Types.Blob()
        with my_blob as fileobj:
            with open(local_filepath, mode="rb") as file:  # "b" is important -> binary
                fileobj.write(file.read())
        my_blobs.append(my_blob)
        file_names.append(basename(local_filepath))

    return my_blobs, file_names
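# A minimal usage sketch (an assumption, not from the original source): write a couple of
# files into a temporary directory and collect them as Flyte Blobs with collect_blobs.
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
    for name in ("a.txt", "b.txt"):
        with open(join(tmpdir, name), "w") as f:
            f.write("some content for " + name)
    blobs, names = collect_blobs(tmpdir)
    assert names == ["a.txt", "b.txt"]  # files are returned in sorted order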
def test_blob_passing():
    @inputs(a=Types.Blob)
    @outputs(b=Types.Blob)
    @python_task
    def test_pass(wf_params, a, b):
        b.set(a)

    b = Types.Blob()
    with b as w:
        w.write("Hello world".encode("utf-8"))

    out = test_pass.unit_test(a=b)
    assert len(out) == 1
    with out["b"] as r:
        assert r.read().decode("utf-8") == "Hello world"

    out = test_pass.unit_test(a=out["b"])
    assert len(out) == 1
    with out["b"] as r:
        assert r.read().decode("utf-8") == "Hello world"
def test_bad_definition():
    with pytest.raises(_user_exceptions.FlyteValueException):
        Types.Schema([])
def test_generic_schema():
    @inputs(a=Types.Schema())
    @outputs(b=Types.Schema())
    @python_task
    def fake_task(wf_params, a, b):
        pass
cache_version="1", cluster_label=_six.text_type("cluster_label"), tags=[_six.text_type("tag1")], ) def sample_qubole_hive_task(wf_params, in1): return _six.text_type("select ") + _six.text_type(in1) def test_hive_task(): assert isinstance(sample_hive_task, _sdk_runnable.SdkRunnableTask) assert isinstance(sample_hive_task, _hive_task.SdkHiveTask) sample_hive_task.unit_test(in1=5) @outputs(hive_results=[Types.Schema()]) @qubole_hive_task def two_queries(wf_params, hive_results): q1 = "SELECT 1" q2 = "SELECT 'two'" schema_1, formatted_query_1 = Schema.create_from_hive_query( select_query=q1) schema_2, formatted_query_2 = Schema.create_from_hive_query( select_query=q2) hive_results.set([schema_1, schema_2]) return [formatted_query_1, formatted_query_2] def test_interface_setup(): outs = two_queries.interface.outputs
def test_write(wf_params, a):
    b = Types.Blob()
    with b as w:
        w.write("Hello world".encode("utf-8"))
    a.set(b)
import json

from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output


@inputs(custom=Types.Generic)
@outputs(counts=Types.Generic, replicated=Types.List(Types.Generic))
@python_task
def generic_type_task(wf_params, custom, counts, replicated):
    """
    Go through each value of the input and, if it is a str, record its length.
    Also create a replicated list of the Generic.
    """
    wf_params.logging.info("Running custom object task")
    results = {}
    for k, v in custom.items():
        if type(v) == str:
            results[k] = len(v)
        else:
            results[k] = v
    counts.set(results)
    replicated.set([custom, custom])


@inputs(replicated=Types.List(Types.Generic))
@outputs(str_repr=Types.String)
@python_task
def generic_to_json(wf_params, replicated, str_repr):
    """
def add_one(wf_params, a, b):
    b.set(a + 1)


@inputs(a=Types.Integer)
@outputs(b=Types.Integer)
@python_task(cache=True, cache_version='1')
def subtract_one(wf_params, a, b):
    b.set(a - 1)


@outputs(a=Types.Blob,
         b=Types.CSV,
         c=Types.MultiPartCSV,
         d=Types.MultiPartBlob,
         e=Types.Schema([('a', Types.Integer), ('b', Types.Integer)]))
@python_task
def write_special_types(wf_params, a, b, c, d, e):
    blob = Types.Blob()
    with blob as w:
        w.write("hello I'm a blob".encode('utf-8'))

    csv = Types.CSV()
    with csv as w:
        w.write("hello,i,iz,blob")

    mpcsv = Types.MultiPartCSV()
    with mpcsv.create_part('000000') as w:
        w.write("hello,i,iz,blob")
    with mpcsv.create_part('000001') as w:
        w.write("hello,i,iz,blob2")
import os

from flytekit.sdk.tasks import python_task, inputs, outputs, dynamic_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output

from demo.house_price_predictor import generate_data, save_to_file, save_to_dir, fit, predict


@inputs(locations=Types.List(Types.String),
        number_of_houses_per_location=Types.Integer,
        seed=Types.Integer)
@outputs(train=Types.List(Types.MultiPartCSV),
         val=Types.List(Types.MultiPartCSV),
         test=Types.List(Types.CSV))
@python_task(cache=True, cache_version="0.1", memory_request="200Mi")
def generate_and_split_data_multiloc(wf_params, locations,
                                     number_of_houses_per_location, seed,
                                     train, val, test):
    train_sets = []
    val_sets = []
    test_sets = []
    for loc in locations:
        _train, _val, _test = generate_data(loc, number_of_houses_per_location, seed)
        dir = "multi_data"
        os.makedirs(dir, exist_ok=True)
        train_sets.append(save_to_dir(dir, "train", _train))
        val_sets.append(save_to_dir(dir, "val", _val))
        test_sets.append(save_to_file(dir, "test", _test))
    train.set(train_sets)
    # We know we are writing just one file, so we will just read the one file
    df = pd.read_csv(os.path.join(train.local_path, files[0]), header=None)
    y = df[df.columns[0]]
    x = df[df.columns[1:]]

    # Fit the model on the training data
    m = XGBClassifier()
    m.fit(x, y)

    # TODO: the model Blob should be a file-like object
    fname = "model.joblib.dat"
    joblib.dump(m, fname)
    model.set(fname)


@inputs(test=Types.CSV, model_ser=Types.Blob)  # TODO: format=".joblib.dat"
@outputs(predictions=Types.List(Types.Float), accuracy=Types.Float)
@python_task(cache_version='1.0', cache=True, memory_request="200Mi")
def predict(ctx, test, model_ser, predictions, accuracy):
    """
    Given any trained model serialized using joblib (this method can be shared!) and a set
    of features, this method returns predictions.
    """
    # Load the model
    model_ser.download()
    model = joblib.load(model_ser.local_path)

    # Load the test data
    test.download()
    test_df = pd.read_csv(test.local_path, header=None)
    x_df = test_df[test_df.columns[1:]]
    y_df = test_df[test_df.columns[0]]

    y_pred = model.predict(x_df)
def test_typed_schema():
    @inputs(a=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @outputs(b=Types.Schema([("a", Types.Integer), ("b", Types.Integer)]))
    @python_task
    def fake_task(wf_params, a, b):
        pass
"../../../common/configs/local.config", ), internal_overrides={ "image": "myflyteimage:v123", "project": "myflyteproject", "domain": "development" }, ): s = t.serialize() assert isinstance(s, _admin_task_pb2.TaskSpec) assert s.template.id.name == "tests.flytekit.unit.common_tests.tasks.test_task.my_task" assert s.template.container.image == "myflyteimage:v123" schema = Types.Schema([("a", Types.String), ("b", Types.Integer)]) def test_task_produce_deterministic_version(): containerless_task = SdkPrestoTask( task_inputs=inputs(ds=Types.String, rg=Types.String), statement= "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10", output_schema=schema, routing_group="{{ .Inputs.rg }}", ) identical_containerless_task = SdkPrestoTask( task_inputs=inputs(ds=Types.String, rg=Types.String), statement= "SELECT * FROM flyte.widgets WHERE ds = '{{ .Inputs.ds}}' LIMIT 10", output_schema=schema,
def test_write(wf_params, a):
    b = Types.CSV()
    with b as w:
        w.write("Hello,world,hi")
    a.set(b)