def main(self, ds):
    """Fill missing values of ``self.column`` with that column's median.

    The frame stored under ``"titanic"`` is modified in place.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with the filled column stored under
            ``"fillna_<column>"``.
    """
    frame = ds.get("titanic").content
    column = self.column

    # Compute the median first, then write the filled column back.
    median = frame[column].median()
    frame[column] = frame[column].fillna(median)

    result = DataSet()
    result.put(f"fillna_{column}", DataFrameData(frame[column]))
    return result
def main(self, ds):
    """Load ``titanic.csv`` from the directory above this file.

    Args:
        ds (DataSet): Ignored; a fresh DataSet is built and returned.

    Returns:
        DataSet: A new DataSet with the loaded data stored under
            ``"titanic"``.
    """
    here = Path(os.path.dirname(__file__))
    csv_path = here / Path("../titanic.csv")
    loaded = DataFrameData.load(LocalFileRepository(csv_path))

    result = DataSet()
    result.put("titanic", loaded)
    return result
def main(self, ds):
    """Replace the ``"Sex"`` labels with integer codes (male=0, female=1).

    The frame stored under ``"titanic"`` is modified in place.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with the whole frame stored under
            ``"titanic"``.
    """
    frame = ds.get("titanic").content

    # Apply the label -> code substitutions in the same order as before.
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

    result = DataSet()
    result.put("titanic", DataFrameData(frame))
    return result
def main(self, ds):
    """Replace the ``"Sex"`` labels with integer codes (male=0, female=1).

    The frame stored under ``"titanic"`` is modified in place.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with only the encoded column, stored under
            ``"sex_to_code"``.
    """
    frame = ds.get("titanic").content

    # Apply the label -> code substitutions in the same order as before.
    for label, code in (("male", 0), ("female", 1)):
        frame.loc[frame["Sex"] == label, "Sex"] = code

    result = DataSet()
    result.put("sex_to_code", DataFrameData(frame["Sex"]))
    return result
def main(self, ds):
    """Replace the ``"Embarked"`` labels with integer codes (S=0, C=1, Q=2).

    Missing values are filled with ``"S"`` before encoding. The frame
    stored under ``"titanic"`` is modified in place.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with only the encoded column, stored under
            ``"embarked_to_code"``.
    """
    frame = ds.get("titanic").content
    frame["Embarked"] = frame["Embarked"].fillna("S")

    # The position of each label in the tuple is its code.
    for code, port in enumerate(("S", "C", "Q")):
        frame.loc[frame["Embarked"] == port, "Embarked"] = code

    result = DataSet()
    result.put("embarked_to_code", DataFrameData(frame["Embarked"]))
    return result
def main(self, ds):
    """Replace the ``"Sex"`` labels with integer codes (male=0, female=1).

    The frame stored under ``"titanic"`` is modified in place. Before
    returning, the method sleeps for a random 3-10 seconds.
    NOTE(review): the sleep presumably simulates a slow task for demo
    purposes - confirm before removing.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with only the encoded column, stored under
            ``"sex_to_code"``.
    """
    df = ds.get("titanic").content

    # Use a single .loc assignment instead of chained indexing
    # (df["Sex"][mask] = ...): the chained form may write to a temporary
    # copy and is a silent no-op under pandas Copy-on-Write.
    df.loc[df["Sex"] == "male", "Sex"] = 0
    df.loc[df["Sex"] == "female", "Sex"] = 1

    rds = DataSet()
    rds.put("sex_to_code", DataFrameData(df["Sex"]))
    time.sleep(random.randint(3, 10))
    return rds
def main(self, ds):
    """Swap the raw Age/Sex/Embarked columns for their processed versions.

    Drops the three raw columns from the ``"titanic"`` frame and joins the
    contents stored under ``"fillna_Age"``, ``"sex_to_code"`` and
    ``"embarked_to_code"`` in that order.

    Args:
        ds (DataSet): Input DataSet holding the frame and the three
            processed entries.

    Returns:
        DataSet: A new DataSet with the combined frame stored under
            ``"titanic_result"``.
    """
    frame = ds.get("titanic").content
    frame = frame.drop(["Age", "Sex", "Embarked"], axis=1)

    for key in ("fillna_Age", "sex_to_code", "embarked_to_code"):
        frame = frame.join(ds.get(key).content)

    result = DataSet()
    result.put("titanic_result", DataFrameData(frame))
    return result
def main(self, ds):
    """Replace the ``"Embarked"`` labels with integer codes (S=0, C=1, Q=2).

    Missing values are filled with ``"S"`` before encoding. The frame
    stored under ``"titanic"`` is modified in place. Before returning, the
    method sleeps for a random 3-10 seconds.
    NOTE(review): the sleep presumably simulates a slow task for demo
    purposes - confirm before removing.

    Args:
        ds (DataSet): Input DataSet holding the ``"titanic"`` frame.

    Returns:
        DataSet: A new DataSet with only the encoded column, stored under
            ``"embarked_to_code"``.
    """
    df = ds.get("titanic").content
    df["Embarked"] = df["Embarked"].fillna("S")

    # Use a single .loc assignment instead of chained indexing
    # (df["Embarked"][mask] = ...): the chained form may write to a
    # temporary copy and is a silent no-op under pandas Copy-on-Write.
    df.loc[df["Embarked"] == "S", "Embarked"] = 0
    df.loc[df["Embarked"] == "C", "Embarked"] = 1
    df.loc[df["Embarked"] == "Q", "Embarked"] = 2

    rds = DataSet()
    rds.put("embarked_to_code", DataFrameData(df["Embarked"]))
    time.sleep(random.randint(3, 10))
    return rds
def main(self, ds):
    """Log the execution and pick this task's output data key at random.

    ``self._output_datakeys`` becomes ``["DataA-1"]`` when
    ``random.randint(0, 2)`` yields 0 and ``["DataA-2"]`` otherwise.

    Args:
        ds (DataSet): Input DataSet (not read).

    Returns:
        DataSet: An empty DataSet.
    """
    logger.info("execute TaskA")
    drew_zero = random.randint(0, 2) == 0
    self._output_datakeys = ["DataA-1"] if drew_zero else ["DataA-2"]
    return DataSet()
def af_entrypoint(gt, ds, **kwargs):
    """Run a graph task with inputs gathered from its dependencies' XComs.

    Args:
        gt: Graph task; its ``dependencies`` and ``task`` attributes are read.
        ds: Only logged; the task input is rebuilt from XComs.
        **kwargs: Must contain ``"ti"``, an object exposing ``xcom_pull``.

    Returns:
        Whatever ``gt.task.main`` returns for the assembled DataSet.
    """
    logger.info(f"ds={ds}")
    pprint(kwargs)

    # Build the input DataSet from XComs.
    task_instance = kwargs["ti"]
    inputs = DataSet()
    for dependency in gt.dependencies:
        # The class name of the upstream task is used as its task id.
        upstream_name = dependency.task.__class__.__name__
        pulled = task_instance.xcom_pull(task_ids=upstream_name)
        if not pulled:
            continue
        logger.info(f"found Xcom from {upstream_name}")
        inputs.merge(pulled)

    # Run the task and return its result as the XCom.
    return gt.task.main(inputs)
def __init__(self, catalog_ds: DataSet = None, disable_dynamic_dep: bool = False):
    """Constructor.

    Args:
        catalog_ds (DataSet): Stored as ``self.catalog_ds``; a fresh
            ``DataSet`` is used when the argument is falsy.
        disable_dynamic_dep (bool): Stored as-is on the instance.
    """
    self.graph = []
    self.error_handlers = []
    self.abort = False

    if not catalog_ds:
        catalog_ds = DataSet()
    self.catalog_ds = catalog_ds

    self.disable_dynamic_dep = disable_dynamic_dep
def test_default_ds(self):
    """A DataSet passed to ``Graph.run`` must reach a task without dependencies."""

    class TestTask(Task):
        def main(self, ds: DataSet):
            # Fail the run unless the default entry was delivered.
            if "default" in ds.keys():
                return DataSet()
            raise ValueError()

    default_ds = DataSet().put("default", JsonData({}))

    graph = Graph()
    graph.append(TestTask())
    graph.run(default_ds)
def test_catalog_ds(self):
    """A catalog DataSet given to ``Graph`` must reach the task's input."""

    class TestTask(Task):
        def main(self, ds: DataSet):
            # Fail the run unless the catalog entry was delivered.
            if "catalog" in ds.keys():
                return DataSet()
            raise ValueError()

    catalog_ds = DataSet().put("catalog", JsonData({}))

    graph = Graph(catalog_ds=catalog_ds)
    graph.append(TestTask())
    graph.run()
def gmain(self, d: DataSet) -> DataSet:
    """Run the task (entry point intended to be called from Graph).

    Exposes the content of every entry of ``d`` through ``self._in``,
    calls ``main`` and stores the wall-clock duration of that call in
    ``self._elapsed_time``.

    Args:
        d (DataSet): Input DataSet.

    Returns:
        DataSet: Output DataSet, i.e. whatever ``main`` returned.
    """
    # Auto-expand the DataSet: map each key to the content of its entry.
    unpacked = {}
    self._in = unpacked
    if d:
        for key in d.keys():
            unpacked[key] = d.get(key).content

    # Execute the task body and time it.
    began = time.time()
    result = self.main(d)
    finished = time.time()
    self._elapsed_time = finished - began
    return result
def _make_task_inputs(self, graph_task: GraphTask, default_ds: DataSet) -> DataSet:
    """Build the input DataSet for a task.

    The catalog DataSet is always merged first. A task with dependencies
    then receives the outputs of those dependencies; a task without any
    receives the default DataSet instead.

    Args:
        graph_task (GraphTask): GraphTask whose inputs are being built.
        default_ds (DataSet): Default DataSet.

    Returns:
        DataSet: Input DataSet.
    """
    inputs = DataSet()
    inputs.merge(self.catalog_ds)

    if graph_task.dependencies:
        for upstream in graph_task.dependencies:
            inputs.merge(upstream.output_ds)
        return inputs

    # No dependencies: fall back to the default DataSet.
    inputs.merge(default_ds)
    return inputs
df = ds.get("titanic").content df["Embarked"] = df["Embarked"].fillna("S") df.loc[df["Embarked"] == "S", "Embarked"] = 0 df.loc[df["Embarked"] == "C", "Embarked"] = 1 df.loc[df["Embarked"] == "Q", "Embarked"] = 2 ds = DataSet() ds.put("titanic", DataFrameData(df)) return ds if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み ds = DataSet() repo = LocalFileRepository( Path(os.path.dirname(__file__)) / Path("../titanic.csv")) titanic_data = DataFrameData.load(repo) ds.put("titanic", titanic_data) # print("## Original data") print(ds.get("titanic").content) # 欠損処理・性別/乗船した港のコード変換等いくつかの処理をチェーンするサンプル if True: ds = FillNaMedian("Age").main(ds) ds = SexToCode().main(ds) ds = EmbarkedToCode().main(ds) else:
def main(self, ds):
    """Log the execution of TaskC.

    Args:
        ds (DataSet): Input DataSet (not read).

    Returns:
        DataSet: An empty DataSet.
    """
    logger.info("execute TaskC")
    empty = DataSet()
    return empty
repo_s = SqlAlchemyRepository(engine) md = SqlAlchemyModelData(repo_s, Titanic) md.update_dataframe(titanic_data.content) md.save() if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み・DBの準備 engine = create_engine("sqlite:///example.sqlite3", echo=True) prepare_db(engine) repo = SqlAlchemyRepository(engine) d = SqlAlchemyModelData(repo, Titanic) d.query() passenger_ids = [m.PassengerId for m in d.content] # データを一行ずつ SQLAlchemy のモデルに取り出し、処理して書き戻す例 for passenger_id in passenger_ids: d.query(lambda x: x.filter(Titanic.PassengerId == passenger_id)) ds = DataSet() ds.put("titanic", d) ds = SexToCode().main(ds) ds = EmbarkedToCode().main(ds) ds.save_all()
df = ds.get("titanic").content df["Embarked"] = df["Embarked"].fillna("S") df.loc[df["Embarked"] == "S", "Embarked"] = 0 df.loc[df["Embarked"] == "C", "Embarked"] = 1 df.loc[df["Embarked"] == "Q", "Embarked"] = 2 ds = DataSet() ds.put("titanic", DataFrameData(df)) return ds if __name__ == "__main__": basicConfig(level=DEBUG) # データセットの読み込み ds = DataSet() repo = LocalFileRepository( Path(os.path.dirname(__file__)) / Path("../titanic.csv")) titanic_data = DataFrameData.load(repo) ds.put("titanic", titanic_data) # print("## Original data") print(ds.get("titanic").content) # Graphで処理する # Age欠損埋め -> 性別のコード化 -> 乗船した港 のコード化 の順で処理 graph = Graph() fill_age = graph.append(FillNaMedian("Age")) sex_to_code = graph.append(SexToCode(), [fill_age]) graph.append(EmbarkedToCode(), [sex_to_code])
def __init__(self, base_dir: Path, catalog_ds: DataSet = None):
    """Set up the graph with a base directory.

    Args:
        base_dir (Path): Stored as ``self.base_dir``; its ``name`` is
            logged.
        catalog_ds (DataSet): Forwarded to the parent constructor. When
            omitted, a fresh ``DataSet`` is created for this instance.
    """
    # A `DataSet()` default in the signature is built once, when the
    # function is defined, and would be shared (and mutated) across every
    # instance created without an explicit catalog.
    if catalog_ds is None:
        catalog_ds = DataSet()
    super().__init__(catalog_ds)
    logger.info(f"set up DebugGraph, base_dir={base_dir.name}")
    self.base_dir = base_dir
def main(self, ds: DataSet):
    """Require a ``"default"`` entry in the input.

    Args:
        ds (DataSet): Input DataSet.

    Returns:
        DataSet: An empty DataSet.

    Raises:
        ValueError: If ``"default"`` is not among the keys of ``ds``.
    """
    if "default" in ds.keys():
        return DataSet()
    raise ValueError()
def main(self, ds: DataSet):
    """Declare ``"DynTaskA"`` as this task's output data key.

    Args:
        ds (DataSet): Input DataSet (not read).

    Returns:
        DataSet: An empty DataSet.
    """
    declared_keys = ["DynTaskA"]
    self._output_datakeys = declared_keys
    return DataSet()
def main(self, ds: DataSet):
    """Do nothing.

    Args:
        ds (DataSet): Input DataSet (not read).

    Returns:
        DataSet: An empty DataSet.
    """
    empty = DataSet()
    return empty
def __init__(self, executor=None, catalog_ds: DataSet = None,
             disable_dynamic_dep: bool = False):
    """Set up the graph with an executor.

    Args:
        executor: Stored as ``self.pool``. When ``None``, a default
            ``ThreadPoolExecutor`` is created.
        catalog_ds (DataSet): Forwarded to the parent constructor. When
            omitted, a fresh ``DataSet`` is created for this instance.
        disable_dynamic_dep (bool): Forwarded to the parent constructor.
    """
    # A `DataSet()` default in the signature is built once, when the
    # function is defined, and would be shared (and mutated) across every
    # instance created without an explicit catalog.
    if catalog_ds is None:
        catalog_ds = DataSet()
    super().__init__(catalog_ds, disable_dynamic_dep=disable_dynamic_dep)
    self.pool = executor if executor is not None else ThreadPoolExecutor()
def main(self, ds):
    """Emit an empty JSON payload under the key ``"DataC"``.

    Args:
        ds (DataSet): Input DataSet (not read).

    Returns:
        The result of ``DataSet.put`` for the new ``"DataC"`` entry.
    """
    payload = JsonData({})
    output = DataSet()
    return output.put("DataC", payload)