from typing import List

import pyspark
from pyspark.sql import Row


def add_elapsed(df: pyspark.sql.DataFrame,
                cols: List[str]) -> pyspark.sql.DataFrame:
    def add_elapsed_column(col, asc):
        def fn(rows):
            # Track, within each Store, the Date of the last row whose `col`
            # value was truthy (initialized to the Store's first Date in the
            # current sort order).
            last_store, last_date = None, None
            for r in rows:
                if last_store != r.Store:
                    last_store = r.Store
                    last_date = r.Date
                if r[col]:
                    last_date = r.Date
                fields = r.asDict().copy()
                fields[("After" if asc else "Before") + col] = (r.Date -
                                                                last_date).days
                yield Row(**fields)

        return fn

    # repartition: place all rows of a given Store in the same partition
    # sortWithinPartitions: sort each partition by Store and Date (ascending or descending)
    # mapPartitions: apply 'add_elapsed_column' to every partition; toDF() turns the resulting RDD back into a DataFrame
    df = df.repartition(df.Store)
    for asc in [False, True]:
        sort_col = df.Date.asc() if asc else df.Date.desc()
        rdd = df.sortWithinPartitions(df.Store.asc(), sort_col).rdd
        for col in cols:
            rdd = rdd.mapPartitions(add_elapsed_column(col, asc))
        df = rdd.toDF()
    return df
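
A minimal usage sketch, assuming an active SparkSession; `sales_df` and the Promo column are hypothetical names, not taken from the original:

# Hypothetical usage: per Store, AfterPromo holds the days elapsed since the
# most recent truthy Promo row when dates are scanned in ascending order, and
# BeforePromo is the analogous value with dates scanned in descending order.
sales_df = add_elapsed(sales_df, cols=["Promo"])
sales_df.select("Store", "Date", "AfterPromo", "BeforePromo").show(5)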
Example #2
    def save_to_parquet(self,
                        df: pyspark.sql.DataFrame,
                        name: str,
                        mode: str = "overwrite",
                        num_partitions: int = None,
                        partition_cols: List[str] = None,
                        pre_final: bool = False):
        """Saves a DataFrame into a parquet file.

        Args:
            df (pyspark.sql.DataFrame):
            name (str):
            mode (str):
            num_partitions (int):
            partition_cols (list):
            pre_final (bool):
        """

        logger.debug("Saving %s.pre_final to parquet.." % name if pre_final
                     else "Saving %s to parquet.." % name)
        path = os.path.join(self.df_data_folder, name, str(self.loop_counter))
        if not os.path.exists(path):
            os.makedirs(path)
        if pre_final:
            parquet_name = os.path.join(path, name + ".pre_final.parquet")
        else:
            parquet_name = os.path.join(path, name + ".parquet")

        if partition_cols and num_partitions:
            df.repartition(
                num_partitions,
                *partition_cols).write.mode(mode).parquet(parquet_name)
        elif num_partitions and not partition_cols:
            df.repartition(num_partitions).write.mode(mode).parquet(
                parquet_name)
        elif partition_cols and not num_partitions:
            df.repartition(
                *partition_cols).write.mode(mode).parquet(parquet_name)
        else:
            df.repartition(1).write.mode(mode).parquet(parquet_name)
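
A hedged usage sketch; the enclosing class is not shown here, so `writer` (an instance exposing df_data_folder and loop_counter) and the column names are assumptions:

# Hypothetical call on an instance of the enclosing class:
writer.save_to_parquet(sales_df, name="sales", num_partitions=8,
                       partition_cols=["year", "month"])
# Writes <df_data_folder>/sales/<loop_counter>/sales.parquet after
# repartitioning the DataFrame into 8 partitions keyed on year and month.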
Example #3
    def save_to_ray(self, df: pyspark.sql.DataFrame,
                    num_shards: int) -> PandasDataset:
        # Call the Java-side raydp ObjectStoreWriter from Python via the JVM gateway
        df = df.repartition(num_shards)
        sql_context = df.sql_ctx
        jvm = sql_context.sparkSession.sparkContext._jvm
        jdf = df._jdf
        object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(
            jdf)
        records = object_store_writer.save()

        worker = ray.worker.global_worker

        blocks: List[ray.ObjectRef] = []
        block_sizes: List[int] = []
        for record in records:
            owner_address = record.ownerAddress()
            object_id = ray.ObjectID(record.objectId())
            num_records = record.numRecords()
            # Register the ownership of the ObjectRef
            worker.core_worker.deserialize_and_register_object_ref(
                object_id.binary(), ray.ObjectRef.nil(), owner_address)

            blocks.append(object_id)
            block_sizes.append(num_records)

        # Assumed behavior of divide_blocks: it maps each shard index in
        # range(num_shards) to the list of block indexes assigned to that shard.
        divided_blocks = divide_blocks(block_sizes, num_shards)
        record_batch_set: List[RecordBatch] = []
        for i in range(num_shards):
            indexes = divided_blocks[i]
            object_ids = [blocks[index] for index in indexes]
            record_batch_set.append(RecordBatch(object_ids))

        # TODO: we should specify the resource spec for each shard
        ds = parallel_dataset.from_iterators(generators=record_batch_set,
                                             name="spark_df")

        def resolve_fn(it: "Iterable[RecordBatch]") -> "Iterator[RecordBatch]":
            for item in it:
                item.resolve()
                yield item

        return ds.transform(resolve_fn,
                            ".RecordBatch#resolve()").flatten().to_pandas(None)