class MultiRegionHousePricePredictionModelTrainer(object):
    """
    Flyte workflow that generates synthetic house-price data per region,
    trains one XGBoost model per region in parallel, and runs each model's
    predictions against that region's test dataset.
    """
    # Workflow inputs: Flyte `Input` declarations become launch parameters.
    regions = Input(Types.List(Types.String),
                    default=["SFO", "SEA", "DEN"],
                    help="Regions for where to train the model.")
    seed = Input(Types.Integer, default=7, help="Seed to use for splitting.")
    num_houses_per_region = Input(
        Types.Integer,
        default=1000,
        help="Number of houses to generate data for in each region")

    # the actual algorithm: generate/split -> fit per region -> predict per region
    split = generate_and_split_data_multiloc(
        locations=regions,
        number_of_houses_per_location=num_houses_per_region,
        seed=seed)
    fit_task = parallel_fit(multi_train=split.outputs.train)
    predicted = parallel_predict(multi_models=fit_task.outputs.multi_models,
                                 multi_test=split.outputs.test)

    # Outputs: joblib-serialized models per region and accuracy of the model per region.
    # Note: these parallel lists are index-aligned with `regions`; a map keyed by
    # region would be clearer, but for the demo we output simple lists.
    models = Output(fit_task.outputs.multi_models,
                    sdk_type=Types.List(Types.Blob))
    accuracies = Output(predicted.outputs.accuracies,
                        sdk_type=Types.List(Types.Float))
# --- Пример #2 (Example #2) — scraped-snippet separator; the stray "0" below it was a vote count ---
from flytekit.sdk.tasks import inputs, outputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output
import json


@inputs(custom=Types.Generic)
@outputs(counts=Types.Generic, replicated=Types.List(Types.Generic))
@python_task
def generic_type_task(wf_params, custom, counts, replicated):
    """
    Inspect each value of the generic (dict-like) input: for every string
    value record its length, and pass every other value through unchanged.
    Also emit a list containing two copies of the input.

    :param wf_params: Flyte execution parameters (used here for logging).
    :param custom: dict-like Generic input to inspect.
    :param counts: output Generic mapping each key to len(value) for string
        values, or to the original value otherwise.
    :param replicated: output list containing the input duplicated twice.
    """
    wf_params.logging.info("Running custom object task")
    # isinstance is the idiomatic type check (also accepts str subclasses,
    # which `type(v) == str` would reject).
    results = {k: len(v) if isinstance(v, str) else v
               for k, v in custom.items()}

    counts.set(results)
    replicated.set([custom, custom])


@inputs(replicated=Types.List(Types.Generic))
@outputs(str_repr=Types.String)
@python_task
def generic_to_json(wf_params, replicated, str_repr):
    # NOTE(review): this snippet is truncated by the scrape — the docstring
    # opened below is left unterminated and the body is not visible here.
    """
import os

from flytekit.sdk.tasks import python_task, inputs, outputs, dynamic_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input, Output

from demo.house_price_predictor import generate_data, save_to_file, save_to_dir, fit, predict


@inputs(locations=Types.List(Types.String),
        number_of_houses_per_location=Types.Integer,
        seed=Types.Integer)
@outputs(train=Types.List(Types.MultiPartCSV),
         val=Types.List(Types.MultiPartCSV),
         test=Types.List(Types.CSV))
@python_task(cache=True, cache_version="0.1", memory_request="200Mi")
def generate_and_split_data_multiloc(wf_params, locations,
                                     number_of_houses_per_location, seed,
                                     train, val, test):
    """
    Generate synthetic house data for each location and split it into
    train/val/test sets, producing one entry per location (index-aligned
    with ``locations``).
    """
    train_sets = []
    val_sets = []
    test_sets = []
    for loc in locations:
        # generate_data yields a (train, val, test) split for this location.
        _train, _val, _test = generate_data(loc, number_of_houses_per_location,
                                            seed)
        # NOTE(review): `dir` shadows the builtin; consider renaming.
        dir = "multi_data"
        os.makedirs(dir, exist_ok=True)  # same dir is reused each iteration
        train_sets.append(save_to_dir(dir, "train", _train))
        val_sets.append(save_to_dir(dir, "val", _val))
        test_sets.append(save_to_file(dir, "test", _test))
    train.set(train_sets)
    # NOTE(review): snippet appears truncated here — the declared `val` and
    # `test` outputs are never set in the visible code; presumably
    # val.set(val_sets) / test.set(test_sets) follow in the original source.
# --- Пример #4 (Example #4) — scraped-snippet separator; the stray "0" below it was a vote count ---
    # We know we are writing just one file, so we will just read the one file.
    # NOTE(review): `files`, `train`, and `model` are defined earlier in this
    # function, which is cut off above this chunk.
    # header=None: the CSV has no header row; column 0 is the label, the
    # remaining columns are features.
    df = pd.read_csv(os.path.join(train.local_path, files[0]), header=None)
    y = df[df.columns[0]]
    x = df[df.columns[1:]]
    # fit the model on the training data
    m = XGBClassifier()
    m.fit(x, y)

    # TODO model Blob should be a file like object
    # Serialize the trained model with joblib and publish the file path as
    # the task's Blob output.
    fname = "model.joblib.dat"
    joblib.dump(m, fname)
    model.set(fname)


@inputs(test=Types.CSV, model_ser=Types.Blob)  # TODO: format=".joblib.dat"))
@outputs(predictions=Types.List(Types.Float), accuracy=Types.Float)
@python_task(cache_version='1.0', cache=True, memory_request="200Mi")
def predict(ctx, test, model_ser, predictions, accuracy):
    """
    Given any trained model, serialized using joblib (this method can be shared!)
    and features, this method returns predictions.
    """
    # Load model: fetch the Blob locally, then deserialize with joblib.
    model_ser.download()
    model = joblib.load(model_ser.local_path)
    # Load test data; header=None: no header row, column 0 is the label.
    test.download()
    test_df = pd.read_csv(test.local_path, header=None)
    x_df = test_df[test_df.columns[1:]]  # feature columns
    y_df = test_df[test_df.columns[0]]   # ground-truth labels
    y_pred = model.predict(x_df)
    # NOTE(review): this chunk is truncated here — the declared `predictions`
    # and `accuracy` outputs must be set in code past the end of this view.