Example #1
def _dataframe_join(left, right, on, how, num_processors=None):
    # make both tables co-partitioned to improve join performance
    left = repartition_df(left,
                          partition_by=on,
                          num_processors=num_processors)
    right = repartition_df(right,
                           partition_by=on,
                           num_processors=num_processors)
    return left.join(right, on=on, how=how)
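
A hypothetical call to the helper above, assuming repartition_df is importable; the session setup, column names, and processor count are illustrative, not from the original project:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[2]").getOrCreate()

orders = spark.createDataFrame([(1, "book"), (2, "pen")], ["user_id", "item"])
users = spark.createDataFrame([(1, "alice"), (2, "bob")], ["user_id", "name"])

# co-partition both sides on the join key before joining
joined = _dataframe_join(orders, users, on=["user_id"], how="inner",
                         num_processors=4)
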
Example #2
def _dataframe_join(
    left: DataFrame,
    right: DataFrame,
    on: List[str],
    how: str,
    num_processors: Optional[int] = None,
) -> DataFrame:
    # make both tables co-partitioned to improve join performance
    left = repartition_df(left,
                          partition_by=on,
                          num_processors=num_processors)
    right = repartition_df(right,
                           partition_by=on,
                           num_processors=num_processors)
    return left.join(right, on=on, how=how)
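
Both versions of _dataframe_join delegate the co-partitioning to repartition_df. A minimal sketch of what such a helper could look like, assuming it wraps PySpark's DataFrame.repartition and derives a partition count from num_processors; the fallback constant and the body are assumptions, not the library's actual implementation:

from typing import List, Optional

from pyspark.sql import DataFrame

# assumed fallback, mirrors Spark's default for spark.sql.shuffle.partitions
DEFAULT_NUM_PARTITIONS = 200

def repartition_df(
    dataframe: DataFrame,
    partition_by: List[str],
    num_partitions: Optional[int] = None,
    num_processors: Optional[int] = None,
) -> DataFrame:
    # pick a partition count: explicit value wins, then processors, then default
    if num_partitions is None:
        num_partitions = num_processors or DEFAULT_NUM_PARTITIONS
    # hash-partition by the given columns so equal keys land in the same partition
    return dataframe.repartition(num_partitions, *partition_by)
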
Example #3
    def test_repartition_df(self, input_df):
        result_df = repartition_df(dataframe=input_df,
                                   partition_by=["timestamp"])

        # Only one distinct partition id, meaning the input is not partitioned yet
        assert input_df.select(spark_partition_id()).distinct().count() == 1
        # The result has the desired number of partitions
        assert result_df.select(spark_partition_id()).distinct().count() == 200
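
The test above relies on an input_df fixture and on spark_partition_id from pyspark.sql.functions. A hypothetical harness that would make it runnable; the fixture data and the single-partition setup are illustrative:

import pytest
from pyspark.sql import SparkSession
from pyspark.sql.functions import spark_partition_id

@pytest.fixture
def input_df():
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    rows = [(ts,) for ts in range(10)]
    # coalesce(1) guarantees a single input partition, matching the first assert
    return spark.createDataFrame(rows, ["timestamp"]).coalesce(1)
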
Example #4
def _create_partitions(self, dataframe):
    # create year partition column
    dataframe = dataframe.withColumn(
        columns.PARTITION_YEAR, year(dataframe[columns.TIMESTAMP_COLUMN])
    )
    # create month partition column
    dataframe = dataframe.withColumn(
        columns.PARTITION_MONTH, month(dataframe[columns.TIMESTAMP_COLUMN])
    )
    # create day partition column
    dataframe = dataframe.withColumn(
        columns.PARTITION_DAY, dayofmonth(dataframe[columns.TIMESTAMP_COLUMN])
    )
    return repartition_df(dataframe, self.PARTITION_BY, self.num_partitions)
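
The method above depends on a project-specific columns module and class attributes. A self-contained approximation of the same year/month/day partitioning, with hypothetical constants standing in for that module and reusing the repartition_df sketch from above:

from pyspark.sql.functions import year, month, dayofmonth

TIMESTAMP_COLUMN = "timestamp"  # assumed column names
PARTITION_YEAR, PARTITION_MONTH, PARTITION_DAY = "year", "month", "day"

def create_partitions(dataframe, num_partitions=None):
    # derive calendar partition columns from the event timestamp
    dataframe = (
        dataframe
        .withColumn(PARTITION_YEAR, year(dataframe[TIMESTAMP_COLUMN]))
        .withColumn(PARTITION_MONTH, month(dataframe[TIMESTAMP_COLUMN]))
        .withColumn(PARTITION_DAY, dayofmonth(dataframe[TIMESTAMP_COLUMN]))
    )
    # co-locate rows of the same calendar day in the same partition
    return repartition_df(
        dataframe,
        partition_by=[PARTITION_YEAR, PARTITION_MONTH, PARTITION_DAY],
        num_partitions=num_partitions,
    )
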
Example #5
    def _aggregate(
        self,
        dataframe: DataFrame,
        features: List[Feature],
        window: Optional[Window] = None,
        num_processors: Optional[int] = None,
    ) -> DataFrame:
        aggregations = [
            c.function for f in features for c in f.transformation.aggregations
        ]

        groupby = self.keys_columns.copy()
        if window is not None:
            dataframe = dataframe.withColumn("window", window.get())
            groupby.append("window")
        else:
            groupby.append(self.timestamp_column)

        if self._distinct_subset:
            # keep only the first or last row per key, ordered by timestamp
            orderby = (
                functions.col(self.timestamp_column).desc()
                if self._distinct_keep == "last"
                else functions.col(self.timestamp_column).asc()
            )

            partition_window = (
                sql.Window()
                .partitionBy(*groupby, *self._distinct_subset)
                .orderBy(orderby)
            )

            dataframe = dataframe.withColumn(
                "keep_rn", functions.row_number().over(partition_window)
            ).filter("keep_rn = 1")

        # repartition to have all rows for each group in the same partition;
        # that way we won't have to shuffle data when grouping by id
        dataframe = repartition_df(
            dataframe,
            partition_by=groupby,
            num_processors=num_processors,
        )
        grouped_data = dataframe.groupby(*groupby)

        if self._pivot_column:
            grouped_data = grouped_data.pivot(self._pivot_column,
                                              self._pivot_values)

        aggregated = grouped_data.agg(*aggregations)
        return self._with_renamed_columns(aggregated, features, window)
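
The _distinct_subset branch above is a keep-first/keep-last deduplication: rows are ranked per key by timestamp and only the top-ranked row survives. The same pattern in isolation, with hypothetical data and column names:

from pyspark.sql import SparkSession, Window, functions

spark = SparkSession.builder.master("local[2]").getOrCreate()

events = spark.createDataFrame(
    [(1, "a", 10), (1, "a", 20), (2, "b", 5)],
    ["id", "feature", "timestamp"],
)

# rank rows within each id, newest first, and keep only the newest ("last") row
window = Window.partitionBy("id").orderBy(functions.col("timestamp").desc())
latest = (
    events.withColumn("keep_rn", functions.row_number().over(window))
    .filter("keep_rn = 1")
    .drop("keep_rn")
)
latest.show()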