def add_elapsed(df: pyspark.sql.DataFrame, cols: List[str]) -> pyspark.sql.DataFrame:
    def add_elapsed_column(col, asc):
        def fn(rows):
            last_store, last_date = None, None
            for r in rows:
                if last_store != r.Store:
                    last_store = r.Store
                    last_date = r.Date
                if r[col]:
                    last_date = r.Date
                fields = r.asDict().copy()
                fields[("After" if asc else "Before") + col] = (r.Date - last_date).days
                yield Row(**fields)
        return fn

    # repartition: rearrange the rows in the DataFrame based on the partitioning expression
    # sortWithinPartitions: sort every partition in the DataFrame based on specific columns
    # mapPartitions: apply the row generator returned by 'add_elapsed_column' to each
    #                partition of the RDD, then convert the result back into a DataFrame
    df = df.repartition(df.Store)
    for asc in [False, True]:
        sort_col = df.Date.asc() if asc else df.Date.desc()
        rdd = df.sortWithinPartitions(df.Store.asc(), sort_col).rdd
        for col in cols:
            rdd = rdd.mapPartitions(add_elapsed_column(col, asc))
        df = rdd.toDF()
    return df
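A minimal usage sketch, assuming a Rossmann-style table with Store, Date, and boolean event columns; the input path and the event column names (Promo, StateHoliday) are illustrative assumptions, not taken from the source. The imports shown are also the ones add_elapsed itself relies on (List, pyspark, Row):

# Usage sketch for add_elapsed (assumed input schema: Store, Date, Promo, StateHoliday).
from typing import List

import pyspark
from pyspark.sql import Row, SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName("elapsed-demo").getOrCreate()

# Parse Date into a DateType column so (r.Date - last_date).days works on Python dates.
train_df = (spark.read.csv("train.csv", header=True, inferSchema=True)
            .withColumn("Date", F.to_date("Date")))

# Adds AfterPromo/BeforePromo and AfterStateHoliday/BeforeStateHoliday columns,
# counting the days elapsed relative to the nearest row (in the given sort
# direction) where the flag is truthy, computed per Store.
train_df = add_elapsed(train_df, ["Promo", "StateHoliday"])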
def save_to_parquet(self, df: pyspark.sql.DataFrame, name: str,
                    mode: str = "overwrite", num_partitions: int = None,
                    partition_cols: List[str] = None, pre_final: bool = False):
    """Saves a DataFrame into a parquet file.

    Args:
        df (pyspark.sql.DataFrame): The DataFrame to save.
        name (str): Dataset name; used to build the output path and file name.
        mode (str): Spark write mode, e.g. "overwrite" or "append".
        num_partitions (int): Number of partitions to repartition to before writing.
        partition_cols (list): Columns to repartition by before writing.
        pre_final (bool): If True, save the file as "<name>.pre_final.parquet".
    """
    logger.debug(
        "Saving %s to parquet.." % name if not pre_final
        else "Saving %s.pre_final to parquet.." % name)
    path = os.path.join(self.df_data_folder, name, str(self.loop_counter))
    if not os.path.exists(path):
        os.makedirs(path)
    if pre_final:
        parquet_name = os.path.join(path, name + ".pre_final.parquet")
    else:
        parquet_name = os.path.join(path, name + ".parquet")

    # Repartition according to whichever hints were provided, then write out parquet.
    if partition_cols and num_partitions:
        df.repartition(
            num_partitions, *partition_cols).write.mode(mode).parquet(parquet_name)
    elif num_partitions and not partition_cols:
        df.repartition(num_partitions).write.mode(mode).parquet(parquet_name)
    elif partition_cols and not num_partitions:
        df.repartition(*partition_cols).write.mode(mode).parquet(parquet_name)
    else:
        df.repartition(1).write.mode(mode).parquet(parquet_name)
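For context, a hypothetical call site; the processor instance, dataset name, and column are assumptions. When both num_partitions and partition_cols are given, the first branch hash-partitions by the given columns into the requested number of partitions before writing:

# Hypothetical call -- "processor" stands for the object that owns df_data_folder
# and loop_counter; the dataset name and column are illustrative.
processor.save_to_parquet(train_df, name="train",
                          num_partitions=8, partition_cols=["Store"])
# Result: <df_data_folder>/train/<loop_counter>/train.parquet, written as 8 files,
# with rows hash-partitioned by Store across them.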
def save_to_ray(self, df: pyspark.sql.DataFrame, num_shards: int) -> PandasDataset:
    # call java function from python
    df = df.repartition(num_shards)
    sql_context = df.sql_ctx
    jvm = sql_context.sparkSession.sparkContext._jvm
    jdf = df._jdf
    object_store_writer = jvm.org.apache.spark.sql.raydp.ObjectStoreWriter(jdf)
    records = object_store_writer.save()

    worker = ray.worker.global_worker

    blocks: List[ray.ObjectRef] = []
    block_sizes: List[int] = []
    for record in records:
        owner_address = record.ownerAddress()
        object_id = ray.ObjectID(record.objectId())
        num_records = record.numRecords()
        # Register the ownership of the ObjectRef
        worker.core_worker.deserialize_and_register_object_ref(
            object_id.binary(), ray.ObjectRef.nil(), owner_address)
        blocks.append(object_id)
        block_sizes.append(num_records)

    divided_blocks = divide_blocks(block_sizes, num_shards)
    record_batch_set: List[RecordBatch] = []
    for i in range(num_shards):
        indexes = divided_blocks[i]
        object_ids = [blocks[index] for index in indexes]
        record_batch_set.append(RecordBatch(object_ids))

    # TODO: we should specify the resource spec for each shard
    ds = parallel_dataset.from_iterators(generators=record_batch_set,
                                         name="spark_df")

    def resolve_fn(it: "Iterable[RecordBatch]") -> "Iterator[RecordBatch]":
        for item in it:
            item.resolve()
            yield item

    return ds.transform(
        resolve_fn, ".RecordBatch#resolve()").flatten().to_pandas(None)
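A sketch of how this conversion might be driven end to end; the ray.init() call and the shard count are assumptions. The key design point is that the JVM writes Arrow record batches straight into the Ray object store and Python only registers ownership of the resulting ObjectRefs, so the data does not pass back through the Python driver:

# Hypothetical driver code -- cluster setup and shard count are assumptions.
import ray

ray.init()  # or ray.init(address="auto") to attach to a running cluster

# Each shard wraps ObjectRefs to record batches already in the Ray object store;
# the returned PandasDataset can then feed distributed training workers.
pandas_ds = processor.save_to_ray(train_df, num_shards=4)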