def testNumericalParameter(self):
    # Build an int parameter bounded by -3 and 7 that is tied to the
    # "bar" entry of config section "foo", then check what _value() yields.
    # NOTE(review): presumably the surrounding test fixture sets foo.bar
    # to -3 in the configuration -- confirm against the decorator/setup.
    config_location = {"section": "foo", "name": "bar"}
    param = luigi.NumericalParameter(
        var_type=int,
        min_value=-3,
        max_value=7,
        config_path=config_location,
    )
    resolved = _value(param)
    self.assertEqual(-3, resolved)
def test_float_max_value_exclusive(self):
    # With right_op=lt the test expects the upper bound itself (7) to be
    # rejected by parse().
    param = luigi.NumericalParameter(
        var_type=float,
        min_value=-3,
        max_value=7,
        left_op=le,
        right_op=lt,
    )
    with self.assertRaises(ValueError):
        param.parse(7)
class BarTask(ShellTask):
    """Write ``bar_<foo_num>.txt`` via a shell ``echo`` and drop a success marker.

    Parameters:
        foo_path: only used in the error message raised on failure.
        foo_num: integer in the range declared below; part of both file names.
        bar_directory: directory that receives the bar file and the marker.
    """

    foo_path = luigi.Parameter()
    foo_num = luigi.NumericalParameter(var_type=int, min_value=0, max_value=10000)
    bar_directory = luigi.Parameter()

    def output(self):
        """Return the ``bar_<foo_num>_success.txt`` marker target."""
        marker_name = "bar_%d_success.txt" % self.foo_num
        return luigi.LocalTarget(os.path.join(self.bar_directory, marker_name))

    def run(self):
        """Run the echo command, fail on a positive return code, then write the marker."""
        bar_path = os.path.join(self.bar_directory, "bar_%d.txt" % self.foo_num)
        with AtomicFilePointer(bar_path).open() as bar_file:
            # The command writes to the pointer's temporary path.
            command = "echo \"%d - bar\" > %s" % (self.foo_num, bar_file.tmp_path)
            (returncode, stdout, stderr) = self.ex(command)
            # NOTE(review): presumably AtomicFilePointer discards the temp
            # file when the block exits with an exception -- confirm.
            if returncode > 0:
                details = (returncode, self.foo_path, self.bar_directory)
                raise Exception("Received error code %s: %s -> %s" % details)

        with self.output().open('w') as out_file:
            out_file.write("1")
def test_float_max_value_inclusive(self):
    # With right_op=le the test expects the upper bound (7) to be accepted
    # and returned by parse().
    param = luigi.NumericalParameter(
        var_type=float,
        min_value=-3,
        max_value=7,
        left_op=le,
        right_op=le,
    )
    parsed = param.parse(7)
    self.assertEqual(7, parsed)
def test_int_min_value_inclusive(self):
    # With left_op=le the test expects the lower bound (-3) to be accepted
    # and returned by parse().
    param = luigi.NumericalParameter(
        var_type=int,
        min_value=-3,
        max_value=7,
        left_op=le,
        right_op=lt,
    )
    parsed = param.parse(-3)
    self.assertEqual(-3, parsed)
class Pull_data(lu.Task):
    """Download the StreetEasy CSV for one borough and save it locally.

    Parameters:
        v: float used only as part of the output file name.
        boro: one of the listed boroughs; selects the remote CSV.
        prod: chooses the ``prod`` or ``staging`` output sub-folder.
    """

    v = lu.NumericalParameter(default=0.1, var_type=float, min_value=0, max_value=100)
    boro = lu.ChoiceParameter(default='Queens', var_type=str,
                              choices=['Queens', 'Brooklyn', 'Manhattan'])
    prod = lu.BoolParameter()

    def output(self):
        """Return the local target ``data/<env>/<boro>/raw_<v>.csv`` under this_folder."""
        environment = "prod" if self.prod else 'staging'
        relative = f'data/{environment}/{self.boro}/raw_{self.v}.csv'
        return lu.LocalTarget(str(this_folder / relative))

    def run(self):
        """Read the remote CSV with pandas and write it to the output path."""
        url = f'https://raw.githubusercontent.com/Codecademy/datasets/master/streeteasy/{self.boro.lower()}.csv'
        frame = pd.read_csv(url)
        # Create the parent directories before pandas writes to the path.
        self.output().makedirs()
        destination = self.output().path
        frame.to_csv(destination)
class Top10(luigi.Task):
    """Summarise one day of 311 complaints: totals and top-N complaint types.

    Parameters:
        date: day passed on to ``Collect311``.
        N: how many complaint types to keep per group (1..100 as declared).

    NOTE(review): the output path does not contain the date, so a report
    written for one day presumably marks the task complete for every other
    day with the same N -- confirm this is intended.
    """

    # The default is evaluated once, when the class body is executed.
    date = luigi.DateParameter(default=date.today())
    N = luigi.NumericalParameter(default=10, min_value=1, max_value=100, var_type=int)

    def requires(self):
        """Depend on the raw 311 data for the same date."""
        return Collect311(date=self.date)

    def output(self):
        """Return the CSV report target ``<folder>/311/top<N>.csv``."""
        return luigi.LocalTarget(f"{folder}/311/top{self.N}.csv")

    @staticmethod
    def _analize(df, date, N=10):
        """Build the list of stat rows for the whole frame and for each borough.

        Each row is a fresh dict with keys ``boro``, ``date``, ``metric`` and
        ``value``. For every group (the full frame labelled ``"NYC"``, then
        each ``borough`` group) there is one ``"complaints"`` row holding the
        row count, followed by one row per top-N ``complaint_type`` holding
        that type's count.

        Args:
            df: frame with ``complaint_type`` and ``borough`` columns.
            date: value stored verbatim in every row's ``date`` field.
            N: number of most frequent complaint types to report per group.

        Returns:
            list of dicts as described above.
        """

        def rows_for(boro, frame):
            # Every row is built as a new dict. The previous version reused
            # and mutated one dict (so the first row was overwritten) and
            # used `dict_["balue"]: v`, an annotation that stores nothing.
            base = {"boro": boro, "date": date}
            rows = [{**base, "metric": "complaints", "value": len(frame)}]
            top_n = frame["complaint_type"].value_counts().nlargest(N).to_dict()
            for complaint, count in top_n.items():
                rows.append({**base, "metric": complaint, "value": count})
            return rows

        stats = rows_for("NYC", df)
        for boro, group in df.groupby("borough"):
            stats.extend(rows_for(boro, group))
        return stats

    def run(self):
        """Read the collected CSV, compute the stats and write the report."""
        df = pd.read_csv(self.input().path)
        rows = self._analize(df, date=self.date, N=self.N)
        data = pd.DataFrame(rows).set_index("date")
        data.to_csv(self.output().path)
class FooTask(ShellTask):
    """Write ``foo_<foo_num>.txt`` into ``foo_directory`` via a shell ``echo``.

    Parameters:
        foo_directory: directory that receives the output file.
        foo_num: integer in the range declared below; echoed into the file
            and used in its name.
    """

    foo_directory = luigi.Parameter()
    foo_num = luigi.NumericalParameter(var_type=int, min_value=0, max_value=10000)

    def output(self):
        """Return the ``foo_<foo_num>.txt`` target inside ``foo_directory``."""
        file_name = "foo_%d.txt" % self.foo_num
        return luigi.LocalTarget(os.path.join(self.foo_directory, file_name))

    def run(self):
        """Echo ``foo_num`` into the atomic pointer's temporary path."""
        destination = self.output().path
        with AtomicFilePointer(destination).open() as foo_file:
            command = "echo %d > %s" % (self.foo_num, foo_file.tmp_path)
            self.run_command(command)
class FooWorkflow(luigi.WrapperTask):
    """Run ``FooTask`` and then ``BarTask`` under ``<root_path>/foo`` and ``<root_path>/bar``.

    Parameters:
        root_path: directory under which the ``foo`` and ``bar`` working
            directories are created.
        foo_num: integer in the range declared below, forwarded to both tasks.
    """

    root_path = luigi.Parameter()
    foo_num = luigi.NumericalParameter(var_type=int, min_value=0, max_value=10000)

    def requires(self):
        """Create the working directories and return the sequenced tasks.

        Returns:
            whatever ``util.sequence_tasks`` returns for ``[FooTask, BarTask]``.
        """
        foo_dir = os.path.join(self.root_path, 'foo')
        bar_dir = os.path.join(self.root_path, 'bar')
        # exist_ok avoids the check-then-create race of isdir() + makedirs().
        os.makedirs(foo_dir, exist_ok=True)
        os.makedirs(bar_dir, exist_ok=True)

        # Same file name FooTask.output() produces. The previous code left
        # the "%s" placeholder unformatted.
        foo_path = os.path.join(foo_dir, "foo_%d.txt" % self.foo_num)

        # FooTask declares foo_directory and BarTask declares bar_directory
        # without defaults, so both must be passed explicitly.
        tasks = [
            FooTask(foo_directory=foo_dir, foo_num=self.foo_num),
            BarTask(foo_path=foo_path,
                    foo_num=self.foo_num,
                    bar_directory=bar_dir),
        ]
        return util.sequence_tasks(tasks)
class Top10(luigi.Task):
    """Summarise 311 complaints for every day from ``start`` to ``date``.

    Parameters:
        date: last day of the range (inclusive).
        start: first day of the range.
        N: how many complaint types to keep per group (1..100 as declared).

    Completion is tracked through a per-date/per-N flag file rather than the
    report itself (see ``complete``).
    """

    # Both defaults are evaluated once, when the class body is executed.
    date = luigi.DateParameter(default=date.today())
    start = luigi.DateParameter(default=datetime(2019, 1, 1))
    N = luigi.NumericalParameter(default=5, min_value=1, max_value=100, var_type=int)

    def requires(self):
        """Return one ``Collect311`` task per day, keyed by ``YYYY-MM-DD``."""
        span = self.date - self.start
        days = [self.start + timedelta(days=offset) for offset in range(span.days + 1)]
        return {day.strftime('%Y-%m-%d'): Collect311(date=day) for day in days}

    def output(self):
        """Return the ``report`` CSV target and the ``flag`` completion marker."""
        return {
            'report': luigi.LocalTarget(f'{folder}/311/top{self.N}.csv'),
            'flag': luigi.LocalTarget(
                f'{folder}/311/_flags/{self.date:%Y/%m/%d}_{self.N}.flag'),
        }

    @staticmethod
    def _analize(df, date, N=10):
        """Build the list of stat rows for the whole frame and for each borough.

        Each row is a fresh dict with keys ``boro``, ``date``, ``metric`` and
        ``value``. For every group (the full frame labelled ``'NYC'``, then
        each ``borough`` group) there is one ``'complaints'`` row holding the
        row count, followed by one row per top-N ``complaint_type`` holding
        that type's count.

        Args:
            df: frame with ``complaint_type`` and ``borough`` columns.
            date: value stored verbatim in every row's ``date`` field.
            N: number of most frequent complaint types to report per group.

        Returns:
            list of dicts as described above.
        """

        def rows_for(boro, frame):
            # Every row is built as a new dict. The previous version reused
            # and mutated one dict (so the first row was overwritten) and
            # used `dict_['balue']: v`, an annotation that stores nothing.
            base = {'boro': boro, 'date': date}
            rows = [{**base, 'metric': 'complaints', 'value': len(frame)}]
            top_n = frame["complaint_type"].value_counts().nlargest(N).to_dict()
            for complaint, count in top_n.items():
                rows.append({**base, 'metric': complaint, 'value': count})
            return rows

        stats = rows_for('NYC', df)
        for boro, group in df.groupby('borough'):
            stats.extend(rows_for(boro, group))
        return stats

    def run(self):
        """Aggregate the stats of every input day, write the report, then the flag.

        Days whose input cannot be read or analysed are skipped (best effort),
        but each skip is logged instead of being silently swallowed.
        """
        import logging
        logger = logging.getLogger(__name__)

        data = []
        for day, target in self.input().items():
            try:
                df = pd.read_csv(target.path)
                data.extend(self._analize(df, date=day, N=self.N))
            except Exception:
                # Deliberately broad: one bad day must not abort the report.
                logger.warning("Skipping 311 data for %s (%s)",
                               day, target.path, exc_info=True)

        data = pd.DataFrame(data)
        logger.debug("Report columns: %s", list(data.columns))
        data = data.set_index('date')
        data.to_csv(self.output()['report'].path)
        # The flag is written last so it only exists once the report does.
        with self.output()['flag'].open('w') as f:
            f.write('!')

    def complete(self):
        """Report completion based solely on the flag file's existence."""
        return self.output()['flag'].exists()
def test_var_type_parameter_exception(self):
    # Constructing the parameter without var_type is expected to raise
    # ParameterException.
    expected_error = luigi.parameter.ParameterException
    with self.assertRaises(expected_error):
        luigi.NumericalParameter(min_value=-3, max_value=7)
def test_endpoint_default_exclusive(self):
    # No right_op is given here; the test expects the upper bound itself (7)
    # to be rejected by parse().
    param = luigi.NumericalParameter(
        var_type=int,
        min_value=-3,
        max_value=7,
    )
    with self.assertRaises(ValueError):
        param.parse(7)
def test_defaults_start_range(self):
    # No left_op is given here; the test expects the lower bound (-3) to be
    # accepted and returned by parse().
    param = luigi.NumericalParameter(
        var_type=int,
        min_value=-3,
        max_value=7,
    )
    parsed = param.parse(-3)
    self.assertEqual(-3, parsed)
class Train_Model(lu.Task):
    """Train an XGBoost rent regressor on the pulled borough data.

    Parameters:
        v: float used only as part of the output directory name.
        boro: one of the listed boroughs; forwarded to ``Pull_data``.
        prod: chooses the ``prod`` or ``staging`` output sub-folder.

    Outputs (all in ``data/<env>/<boro>/model_<v>/`` under this_folder):
        metrics.json, predicted.csv and model.pkl.
    """

    v = lu.NumericalParameter(default=0.1, var_type=float, min_value=0, max_value=100)
    boro = lu.ChoiceParameter(default='Queens', var_type=str,
                              choices=['Queens', 'Brooklyn', 'Manhattan'])
    prod = lu.BoolParameter()

    def output(self):
        """Return the ``metrics``, ``predicted`` and ``model`` targets."""
        prod_ = "prod" if self.prod else 'staging'
        path = this_folder / f'data/{prod_}/{self.boro}/model_{self.v}'
        return {
            'metrics': lu.LocalTarget(str(path / 'metrics.json')),
            'predicted': lu.LocalTarget(str(path / 'predicted.csv')),
            'model': lu.LocalTarget(str(path / 'model.pkl')),
        }

    def requires(self):
        """Depend on the raw data for the same borough and environment."""
        # NOTE(review): v is pinned to 0.1 rather than self.v -- presumably
        # the data version is independent of the model version; confirm.
        return Pull_data(boro=self.boro, prod=self.prod, v=0.1)

    def run(self):
        """Fit the model on an 80/20 split and persist predictions, metrics and model."""
        df = pd.read_csv(self.input().path)
        y = df['rent']
        X = df[[
            'bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor',
            'building_age_yrs', 'no_fee'
        ]]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=.8, random_state=2019)

        # Single source of truth, so the reported hyper-parameters cannot
        # drift from the ones actually used to build the model.
        hyperparams = {'max_depth': 10, 'n_estimators': 1000}
        model = XGBRegressor(random_state=2019, **hyperparams)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

        # The key was previously misspelled 'n_extimators'. float() keeps
        # the MAE JSON-serialisable whatever numeric type is returned.
        metrics = dict(hyperparams,
                       test_mae=float(mean_absolute_error(y_test, pred)))

        outputs = self.output()
        # All three outputs share one directory, so creating it once suffices.
        outputs['predicted'].makedirs()
        # assign() returns a new frame instead of writing into a frame that
        # was derived from a slice of df.
        X_test.assign(predicted=pred).to_csv(outputs['predicted'].path)
        with open(outputs['metrics'].path, 'w') as f:
            json.dump(metrics, f)
        with open(outputs['model'].path, 'wb') as f:
            pickle.dump(model, f)