def cast(self, col: str, newCol: str, fromType: str, toType: str):
    """Cast column *col* from *fromType* to *toType*, writing the result to *newCol*.

    Delegates to :func:`DataFrame_Functions.cast`.

    :return: DataFrame_Extended wrapping the converted DataFrame
    """
    converted = DataFrame_Functions.cast(
        df=self.df, col=col, newCol=newCol, fromType=fromType, toType=toType)
    return DataFrame_Extended(df=converted, spark_session=self.spark_session)
def add_date(self, date):
    """Create the numeric columns 'year', 'month' and 'day' from *date*.

    Month values run 1 (january) through 12 (december).
    Delegates to :func:`DataFrame_Functions.add_date`.

    Attributes:
    -----------
    date: String
        Column with DateType

    :return: DataFrame_Extended
    """
    with_date = DataFrame_Functions.add_date(df=self.df, date=date)
    return DataFrame_Extended(df=with_date, spark_session=self.spark_session)
def join_customized(self, other, left_on: List[str], right_on: List[str], how: str):
    """Join this DataFrame (left side) with *other* (right side).

    *left_on* / *right_on* name the key columns on each side; *how* selects
    the join type. Delegates to :func:`DataFrame_Functions.join`.

    :return: DataFrame_Extended wrapping the joined DataFrame
    """
    joined = DataFrame_Functions.join(
        left=self.df, right=other, left_on=left_on, right_on=right_on, how=how)
    return DataFrame_Extended(df=joined, spark_session=self.spark_session)
def sort(self, column, ascending=False):
    """Return a new :class:`DataFrame_Extended` sorted by the specified column(s).

    Delegates to :func:`DataFrame_Functions.sort`.

    :param column: :class:`Column` or column name(s) to sort by.
    :param ascending: boolean or list of booleans (default False, i.e.
        descending). Specify a list for multiple sort orders; its length
        must then equal the number of sort columns.
    """
    # NOTE(review): the original docstring claimed "default True" and referred
    # to a `cols` parameter; both corrected to match the actual signature.
    df = DataFrame_Functions.sort(df=self.df, column=column, ascending=ascending)
    return DataFrame_Extended(df=df, spark_session=self.spark_session)
def cluster_df(self, cluster_col, cluster_list, groupby_col_list: list = None, sum_col_list: list = None, count_col_list: list = None):
    """Cluster the wrapped DataFrame via :func:`DataFrame_Functions.cluster_df`.

    :param cluster_col: column the clustering is applied to
    :param cluster_list: cluster definitions forwarded to the helper
    :param groupby_col_list: optional columns to group by (defaults to [])
    :param sum_col_list: optional columns to sum (defaults to [])
    :param count_col_list: optional columns to count (defaults to [])
    :return: DataFrame_Extended wrapping the clustered DataFrame
    """
    # The original signature used mutable default arguments (= []), which are
    # shared between calls in Python; None sentinels preserve the empty-list
    # behavior without that hazard.
    groupby_col_list = [] if groupby_col_list is None else groupby_col_list
    sum_col_list = [] if sum_col_list is None else sum_col_list
    count_col_list = [] if count_col_list is None else count_col_list
    df = DataFrame_Functions.cluster_df(
        df=self.df,
        cluster_col=cluster_col,
        cluster_list=cluster_list,
        groupby_col_list=groupby_col_list,
        sum_col_list=sum_col_list,
        count_col_list=count_col_list)
    return DataFrame_Extended(df=df, spark_session=self.spark_session)
def applyFastText(self, delimiter, originalCol, column, newCol, size, window, min_count, epoche):
    """Apply FastText processing to *column*, writing the result to *newCol*.

    All hyper-parameters (size, window, min_count, epoche) are forwarded
    unchanged to :func:`DataFrame_Functions.applyFastText`.

    :return: DataFrame_Extended wrapping the resulting DataFrame
    """
    embedded = DataFrame_Functions.applyFastText(
        df=self.df,
        spark_session=self.spark_session,
        delimiter=delimiter,
        originalCol=originalCol,
        column=column,
        newCol=newCol,
        size=size,
        window=window,
        min_count=min_count,
        epoche=epoche)
    return DataFrame_Extended(df=embedded, spark_session=self.spark_session)
def add_perc(self, column, perc_name):
    """Create a column *perc_name* with the percentage of the value in *column*.

    Delegates to :func:`DataFrame_Functions.add_perc`.

    Attributes:
    -----------
    column: String
        Numerical column which the percentage is calculated for
    perc_name: String
        Name of the created percentage column

    :return: DataFrame_Extended
    """
    # NOTE(review): the original docstring declared the return type as
    # DataFrame_Functions; the method actually returns DataFrame_Extended.
    df = DataFrame_Functions.add_perc(
        df=self.df, column=column, perc_name=perc_name,
        spark_session=self.spark_session)
    return DataFrame_Extended(df=df, spark_session=self.spark_session)
def filter(self, sql_condition):
    """Filter rows using the given condition.

    :func:`where` is an alias for :func:`filter`.

    :param sql_condition: a :class:`Column` of :class:`types.BooleanType`
        or a string of SQL expression.

    df.filter(df.age > 3).collect()
    [Row(age=5, name=u'Bob')]
    df.where(df.age == 2).collect()
    [Row(age=2, name=u'Alice')]

    df.filter("age > 3").collect()
    [Row(age=5, name=u'Bob')]
    df.where("age = 2").collect()
    [Row(age=2, name=u'Alice')]
    """
    filtered = DataFrame_Functions.filter(df=self.df, sql_condition=sql_condition)
    return DataFrame_Extended(df=filtered, spark_session=self.spark_session)
def concatenate(self, cols, name, delimiter=""):
    """Create a column holding the given columns concatenated with *delimiter*.

    Delegates to :func:`DataFrame_Functions.concatenate`.

    :param cols: List[String]
        Columns to concatenate. Column type must be a string
    :param name: String
        Column name of the concatenated column
    :param delimiter: String
        Boundary between separate columns in the concatenated sequence
        (default: "")
    :return: DataFrame_Extended
        Wrapper around the DataFrame with the created concatenation column
    """
    # NOTE(review): the original docstring documented a `df` parameter that
    # this method does not take (the wrapped self.df is used); removed.
    df = DataFrame_Functions.concatenate(
        df=self.df, cols=cols, name=name, delimiter=delimiter)
    return DataFrame_Extended(df=df, spark_session=self.spark_session)
def add_weekday(self, date):
    """Create a numerical column 'weekday' for *date*.

    Values run 1 (monday) through 7 (sunday).
    Delegates to :func:`DataFrame_Functions.add_weekday`.

    Attributes:
    -----------
    date: String
        Column with DateType

    :return: DataFrame_Extended
    """
    with_weekday = DataFrame_Functions.add_weekday(df=self.df, date=date)
    return DataFrame_Extended(df=with_weekday, spark_session=self.spark_session)
def groupby(self, groupBy_col_list, sum_col_list: Optional[List[str]] = None, count_col_list: Optional[List[str]] = None):
    """Group the :class:`DataFrame` using the specified columns, so we can run
    aggregation on them. See :class:`GroupedData` for all the available
    aggregate functions.

    :func:`groupby` is an alias for :func:`groupBy`.

    :param groupBy_col_list: list of columns to group by.
        Each element should be a column name (string) or an expression
        (:class:`Column`).
    :param sum_col_list: list of columns to sum by groups.
        Each element should be a column name (string) or an expression
        (:class:`Column`).
    :param count_col_list: list of columns to count by groups.
        Each element should be a column name (string) or an expression
        (:class:`Column`).
    """
    # NOTE(review): original docstring opened with four quote characters,
    # leaving a stray '"' in the rendered text; annotations tightened to
    # Optional[...] to match the None defaults. Behavior unchanged.
    df = DataFrame_Functions.groupby(
        df=self.df,
        groupBy_col_list=groupBy_col_list,
        sum_col_list=sum_col_list,
        count_col_list=count_col_list)
    return DataFrame_Extended(df=df, spark_session=self.spark_session)
def normalization(self, col, newCol):
    """Normalize column *col* into *newCol* via :func:`DataFrame_Functions.normalization`.

    :return: DataFrame_Extended wrapping the resulting DataFrame
    """
    normalized = DataFrame_Functions.normalization(df=self.df, col=col, newCol=newCol)
    return DataFrame_Extended(df=normalized, spark_session=self.spark_session)
def one_hot_encoder(self, col, newCol, vocabSize):
    """One-hot encode column *col* into *newCol* with vocabulary size *vocabSize*.

    Delegates to :func:`DataFrame_Functions.one_hot_encoder`.

    :return: DataFrame_Extended wrapping the encoded DataFrame
    """
    encoded = DataFrame_Functions.one_hot_encoder(
        df=self.df, col=col, newCol=newCol, vocabSize=vocabSize)
    return DataFrame_Extended(df=encoded, spark_session=self.spark_session)
def split(self, column, newCol, delimiter):
    """Split *column* on *delimiter* into *newCol* via :func:`DataFrame_Functions.split`.

    :return: DataFrame_Extended wrapping the resulting DataFrame
    """
    parts = DataFrame_Functions.split(
        df=self.df, column=column, newCol=newCol, delimiter=delimiter)
    return DataFrame_Extended(df=parts, spark_session=self.spark_session)
def collect_list(self, order_by, group_by_list, column_list):
    """Collect *column_list* values into lists per group.

    Grouping columns come from *group_by_list*; *order_by* is forwarded to
    :func:`DataFrame_Functions.collect_list`.

    :return: DataFrame_Extended wrapping the resulting DataFrame
    """
    collected = DataFrame_Functions.collect_list(
        df=self.df, order_by=order_by,
        group_by_list=group_by_list, column_list=column_list)
    return DataFrame_Extended(df=collected, spark_session=self.spark_session)
def filter_by_list(self, col: str, choice_list: List[str]):
    """Keep rows whose *col* value is constrained by *choice_list*.

    Delegates to :func:`DataFrame_Functions.filter_by_list`.

    :return: DataFrame_Extended wrapping the filtered DataFrame
    """
    kept = DataFrame_Functions.filter_by_list(
        df=self.df, col=col, choice_list=choice_list)
    return DataFrame_Extended(df=kept, spark_session=self.spark_session)
def list_length(self, column):
    """Compute list lengths for *column* via :func:`DataFrame_Functions.list_length`.

    :return: DataFrame_Extended wrapping the resulting DataFrame
    """
    measured = DataFrame_Functions.list_length(df=self.df, column=column)
    return DataFrame_Extended(df=measured, spark_session=self.spark_session)
def select(self, cols: List[str]):
    """Project the DataFrame onto *cols* via :func:`DataFrame_Functions.select`.

    :return: DataFrame_Extended wrapping the projected DataFrame
    """
    projected = DataFrame_Functions.select(df=self.df, cols=cols)
    return DataFrame_Extended(df=projected, spark_session=self.spark_session)