def partition_and_write(self, cluster_spec: ClusterSpec, df: DataFrame) -> Result[ExecutedJob, InvalidJob]:
    if len(self.partition_columns) > 0:
        def write_partitioned(df: PDataFrame) -> Result[ExecutedJob, InvalidJob]:
            # Each group arrives as a pandas DataFrame; convert it back to a dask
            # DataFrame so df_to can write it out in parallel.
            ddf = dd.from_pandas(df, npartitions=cluster_spec.num_workers() or self.partitions or 10)

            # Within a group every partition column holds a single value, so the
            # first unique value labels the partition (e.g. "year=2021&month=05").
            labels = {}
            for p in self.partition_columns:
                value = df[p].unique()[0]
                labels[p] = str(value)
            label = "&".join("=".join(item) for item in labels.items())

            return df_to(ddf, self.output_path, self.output_format, label)

        def repartition_and_write(df: DataFrame) -> Result[ExecutedJob, InvalidJob]:
            pc = self.partition_columns
            df.groupby(pc).apply(write_partitioned, meta=str).compute()
            return Success(ExecutedJob(f"Repartitioned dataframe according to columns {self.partition_columns} and output to {self.output_path}"))

        # check_columns yields Result[DataFrame, InvalidJob]; mapping a
        # Result-returning function over it produces a nested Result, which
        # flatten collapses back into a single Result.
        check = self.check_columns(df, self.partition_columns)
        return flatten(check.map(lambda b: repartition_and_write(b)))
    else:
        repartitioned = self.repartition(df, cluster_spec)
        return flatten(repartitioned.map(lambda b: df_to(b, self.output_path, self.output_format)))
def df_filter_columns(self, cluster_spec: ClusterSpec, df: DataFrame) -> Result[DataFrame, InvalidJob]:
    if len(self.filter_columns) > 0:
        check = self.check_columns(df, self.filter_columns)

        def filter_and_repartition(df: DataFrame) -> Result[DataFrame, InvalidJob]:
            # Project down to the requested columns, then rebalance partitions
            # across the cluster before handing the frame back.
            filtered = df[self.filter_columns]
            return self.repartition(filtered, cluster_spec)

        return flatten(check.map(lambda b: filter_and_repartition(b)))
    else:
        return Success(df)
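# Both methods above lean on the same pattern: check_columns/repartition return a
# Result, mapping a Result-returning function over it yields a nested
# Result[Result[T, E], E], and flatten collapses that back to a single Result.
# A minimal sketch of that pattern, assuming flatten comes from the `returns`
# library (returns.converters.flatten) and Success/Failure from returns.result;
# `validate` is a hypothetical helper, not part of this codebase:
#
#     from returns.converters import flatten
#     from returns.result import Failure, Result, Success
#
#     def validate(n: int) -> Result[int, str]:
#         return Success(n) if n > 0 else Failure("must be positive")
#
#     nested = Success(1).map(validate)            # Result[Result[int, str], str]
#     assert flatten(nested) == Success(1)
#     assert flatten(Failure("boom").map(validate)) == Failure("boom")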
def test_flatten_context(container, merged):
    """Ensures that `flatten` is always returning the correct type."""
    assert flatten(container)(...) == merged(...)
def test_flatten_context():
    """Ensures that `flatten` works with `Context`."""
    assert flatten(
        Context.unit(Context.unit(1)),
    )(Context.Empty) == 1
def test_flatten(container, merged):
    """Ensures that `flatten` is always returning the correct type."""
    assert flatten(container) == merged
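# The `container` and `merged` arguments above are supplied by the test harness,
# typically via pytest parametrization. A minimal sketch of how such cases could
# be wired up, assuming the `returns` containers IO, Some and Success; the
# parameter list and test name are illustrative, not the project's actual fixtures:
#
#     import pytest
#     from returns.converters import flatten
#     from returns.io import IO
#     from returns.maybe import Some
#     from returns.result import Success
#
#     @pytest.mark.parametrize(('container', 'merged'), [
#         (IO(IO(1)), IO(1)),
#         (Some(Some(1)), Some(1)),
#         (Success(Success(1)), Success(1)),
#     ])
#     def test_flatten_example(container, merged):
#         assert flatten(container) == merged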