def repartitionDF(self, df: "DataFrame", partitions: int = 0):
    """Repartition the input dataframe.

    Args:
        df: dataframe to repartition.
        partitions: requested partition count.
            0  -> don't repartition (default)
            -1 -> repartition to the configured default
                  (NumOfExecutors * ExecutorCores * 2)
            >0 -> repartition/coalesce to the given number

    Returns:
        The dataframe, repartitioned/coalesced when a change is needed,
        otherwise unchanged.
    """
    # BUG FIX: getNumPartitions is a method — it must be called, otherwise
    # curParts is a bound-method object and every comparison below is wrong.
    curParts = df.rdd.getNumPartitions()

    # -1 is the internal sentinel for "leave the dataframe as it is".
    if curParts == partitions or partitions == 0:
        finalParts = -1
    elif partitions == -1:
        finalParts = self.__dfltRDDParts
    elif partitions > 0:
        finalParts = partitions
    else:
        # Any other negative request is invalid -> treat as a no-op rather
        # than passing a negative count to coalesce/repartition.
        finalParts = -1

    self.log("Current Partitions: %d , Requested: %d, Final: %d " % (curParts, partitions, finalParts))

    # BUG FIX: the original condition was inverted (`!= -1: return df`),
    # which skipped repartitioning whenever it was actually requested.
    if finalParts == -1:
        return df
    elif curParts > finalParts:
        # Shrinking: coalesce avoids a full shuffle.
        return df.coalesce(finalParts)
    else:
        return df.repartition(finalParts)
def aggregate_dataset_by_year(joined_df: "DataFrame") -> "DataFrame":
    """Aggregate the data per 'PHYSICALID' and issue-date year.

    Pivots the pre-computed per-year counts into one column per year
    (2015-2019), fills missing years with 0, renames each year column to
    COUNT_<year>, and sorts the result by 'PHYSICALID'.

    Args:
        joined_df: dataframe with 'PHYSICALID', 'year' and 'total_cnt' columns.

    Returns:
        Dataframe with columns PHYSICALID, COUNT_2015 .. COUNT_2019,
        ordered by PHYSICALID.
    """
    years = [2015, 2016, 2017, 2018, 2019]
    df = (joined_df.repartition(5, "year")
          .groupBy("PHYSICALID")
          .pivot("year", years)
          .sum("total_cnt"))

    # REFACTOR: the original repeated the null-fill + rename chain five
    # times by hand; one loop performs the identical transformation per year.
    for year in years:
        col_name = str(year)
        df = (df.withColumn(
                  col_name,
                  F.when(F.col(col_name).isNull(), 0).otherwise(F.col(col_name)))
                .withColumnRenamed(col_name, "COUNT_" + col_name))

    return df.sort("PHYSICALID")
def repartition_df(
    dataframe: DataFrame,
    partition_by: List[str],
    num_partitions: int = None,
    num_processors: int = None,
):
    """Repartition *dataframe* on the given columns.

    The target partition count is resolved from ``num_partitions`` /
    ``num_processors`` by the module helper before delegating to Spark.

    Args:
        dataframe: Spark DataFrame to repartition.
        partition_by: columns to partition on.
        num_partitions: explicit number of partitions, if any.
        num_processors: processor count used to derive a partition count.

    Returns:
        The repartitioned DataFrame.
    """
    resolved_partitions = _num_partitions_definition(num_processors, num_partitions)
    return dataframe.repartition(resolved_partitions, *partition_by)