class MakeDatasets(Task):
    """Task that declares a directory of parquet files as its output.

    This class body defines no ``run`` method and no ``Requirement``
    attributes of its own.
    NOTE(review): presumably a shared base for the train/test split tasks --
    confirm against its subclasses.
    """

    # Presumably the share of the dataset held out for testing (0.20 = 20 %).
    # Nothing in this class body reads it -- verify against subclasses.
    TEST_AS_PERCENT_OF_DATASET = 0.20

    # Root directory referenced by the output path pattern below.
    dir_path = luigi.Parameter(default="data")

    requires = Requires()

    # The pattern references the task's ``dir_path`` and class name, so the
    # output presumably lands in "<dir_path>/MakeDatasets/" (TODO confirm how
    # TargetOutput expands the placeholders).
    output = TargetOutput(
        target_class=ParquetTarget,
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        glob="*.parquet",
    )
class TransformData(Task):
    """Apply ``transform_dataframe`` to the output of ``ExtractFeatures``.

    Reads the upstream target as a dask dataframe, transforms it, and writes
    the result to this task's parquet output with gzip compression.
    """

    # Root directory referenced by the output path pattern below.
    dir_path = luigi.Parameter(default="data")

    requires = Requires()
    source_data = Requirement(ExtractFeatures)

    # The pattern references the task's ``dir_path`` and class name
    # (presumably "<dir_path>/TransformData/" -- TODO confirm expansion).
    output = TargetOutput(
        target_class=ParquetTarget,
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        glob="*.parquet",
    )

    def run(self):
        """Read the upstream data, transform it, and write it out."""
        upstream = self.input()["source_data"]
        transformed = transform_dataframe(upstream.read_dask())
        self.output().write_dask(transformed, compression='gzip')
class CleanData(Task):
    """Apply ``clean_datasets`` to the output of ``DownloadData``.

    Reads the upstream target as a dask dataframe, cleans it, and writes the
    result to this task's parquet output with gzip compression.
    """

    # Root directory referenced by the output path pattern below.
    dir_path = luigi.Parameter(default="data")

    requires = Requires()
    source_data = Requirement(DownloadData)

    # The pattern references the task's ``dir_path`` and class name
    # (presumably "<dir_path>/CleanData/" -- TODO confirm expansion).
    output = TargetOutput(
        target_class=ParquetTarget,
        file_pattern="{task.dir_path}/{task.__class__.__name__}/",
        glob="*.parquet",
    )

    def run(self):
        """Read the upstream data, clean it, and write it out."""
        upstream = self.input()["source_data"]
        cleaned = clean_datasets(upstream.read_dask())
        self.output().write_dask(cleaned, compression='gzip')
class TrainModel(Task):
    """Train a model on the ``MakeTrainingSet`` output and pickle it to disk.

    The pickled model is written to ``model_path``.
    """

    # Not referenced in this class's own code.
    # NOTE(review): presumably forwarded to upstream tasks -- confirm.
    dir_path = luigi.Parameter(default="data")
    # Destination file for the pickled model.
    model_path = luigi.Parameter(default="data/Model/model.pckl")

    requires = Requires()
    source_data = Requirement(MakeTrainingSet)

    def output(self):
        """Return the local target located at ``model_path``."""
        return LocalTarget(self.model_path)

    def run(self):
        """Train the model and pickle it through a temporary path."""
        train_ddf = self.input()["source_data"].read_dask()
        model = train_model(train_ddf)

        target = self.output()
        target.makedirs()
        with target.temporary_path() as temp_output_path:
            # Kept from the original implementation: echoes the temp path.
            print(temp_output_path)
            # Close (and therefore flush) the file explicitly before the
            # temporary_path context exits, instead of relying on garbage
            # collection to release the handle.
            with open(temp_output_path, 'wb') as model_file:
                pickle.dump(model, model_file)
# EvaluateModel is defined before VisualizePredictions on purpose: the body of
# VisualizePredictions evaluates ``Requirement(EvaluateModel)`` when the class
# is created, so the name must already be bound at that point.
class EvaluateModel(Task):
    """Run the trained model on the test set and save the predictions.

    Predictions are written with ``np.save`` to ``predicted_values_path``.
    """

    # Not referenced in this class's own code.
    # NOTE(review): presumably forwarded to upstream tasks -- confirm.
    dir_path = luigi.Parameter(default="data")
    # Destination file for the saved predictions.
    predicted_values_path = luigi.Parameter(default="data/EvaluateModel/predicted.npy")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_model = Requirement(TrainModel)

    def output(self):
        """Return the local target located at ``predicted_values_path``."""
        return LocalTarget(self.predicted_values_path)

    def run(self):
        """Load the test set and model, predict, and save the predictions."""
        test_ddf = self.input()["source_data_testset"].read_dask()

        # ``.path`` replaces the deprecated ``.fn`` alias and matches how the
        # other tasks in this file address local targets.
        # SECURITY: pickle.load can execute arbitrary code. The file read here
        # is the TrainModel output; do not point it at untrusted files.
        with open(self.input()["source_model"].path, "rb") as file:
            model = pickle.load(file)

        y_predicted = evaluate_model(model, test_ddf)

        self.output().makedirs()
        np.save(self.output().path, y_predicted)


class VisualizePredictions(Task):
    """Plot the saved predictions against the test set and save the figure.

    The figure is written to ``prediction_visualization_path``.
    """

    # Not referenced in this class's own code.
    # NOTE(review): presumably forwarded to upstream tasks -- confirm.
    dir_path = luigi.Parameter(default="data")
    # Destination file for the rendered figure.
    prediction_visualization_path = luigi.Parameter(default="data/VisualizePredictions/predictions.png")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_predictions = Requirement(EvaluateModel)

    def output(self):
        """Return the local target at ``prediction_visualization_path``."""
        return LocalTarget(self.prediction_visualization_path)

    def run(self):
        """Load the test set and predictions, build the figure, and save it."""
        test_ddf = self.input()["source_data_testset"].read_dask()

        # SECURITY: allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code. The file read here is the
        # EvaluateModel output.
        # NOTE(review): if the predictions are a plain numeric array this
        # flag can be dropped -- confirm.
        y_predicted = np.load(self.input()["source_predictions"].path, allow_pickle=True)

        fig = visualizepredictions(y_predicted, test_ddf)

        self.output().makedirs()
        # https://mattiacinelli.com/tutorial-on-luigi-part-3-pipeline-input-and-output/
        fig.savefig(self.output().path)
class VisualizeFeatureImportance(Task):
    """Plot feature significance for the trained model and save the figure.

    The figure is written to ``importance_path``.
    """

    # Not referenced in this class's own code.
    # NOTE(review): presumably forwarded to upstream tasks -- confirm.
    dir_path = luigi.Parameter(default="data")
    # Destination file for the rendered figure.
    # NOTE(review): the default directory ("VisualizeFeatureSignificance")
    # does not match this class's name; left unchanged so the output does not
    # move.
    importance_path = luigi.Parameter(default="data/VisualizeFeatureSignificance/featureimportance.png")

    requires = Requires()
    source_data_testset = Requirement(MakeTestSet)
    source_model = Requirement(TrainModel)

    def output(self):
        """Return the local target located at ``importance_path``."""
        return LocalTarget(self.importance_path)

    def run(self):
        """Load the test set and model, build the figure, and save it."""
        test_ddf = self.input()["source_data_testset"].read_dask()

        # ``.path`` replaces the deprecated ``.fn`` alias and matches how this
        # class addresses its own output target.
        # SECURITY: pickle.load can execute arbitrary code. The file read here
        # is the TrainModel output; do not point it at untrusted files.
        with open(self.input()["source_model"].path, "rb") as file:
            model = pickle.load(file)

        fig = visualizefeaturesignificance(model, test_ddf)

        self.output().makedirs()
        # https://mattiacinelli.com/tutorial-on-luigi-part-3-pipeline-input-and-output/
        fig.savefig(self.output().path)