Пример #1
0
    def __init__(self, df, path_file, pu=1):
        """Constructor.
        In this method DataFrameTransformer and DataFrameAnalyzer classes are
        instantiated.
        """
        # Path file:
        self._path_file = path_file

        # Pu file:
        self._pu = pu

        # Instance of transformer class:
        self.transformer = DataFrameTransformer(df)

        # Instance of analyzer class:
        self.analyzer = DataFrameAnalyzer(df, self._path_file, self._pu)
Пример #2
0
class Optimus:
    """This class can be considered as a wrapper of DataFrameTransformer and
    DataFrameAnalyzer classes.
    """
    def __init__(self, df, path_file, pu=1):
        """Constructor.
        In this method DataFrameTransformer and DataFrameAnalyzer classes are
        instantiated.
        """
        # Path file:
        self._path_file = path_file

        # Pu file:
        self._pu = pu

        # Instance of transformer class:
        self.transformer = DataFrameTransformer(df)

        # Instance of analyzer class:
        self.analyzer = DataFrameAnalyzer(df, self._path_file, self._pu)

    def get_data_frame(self):
        """This function return the dataframe of the class"""
        return self.transformer.df

    def set_data_frame(self, df):
        """This function set a dataframe into the class for subsequent actions.
        """
        assert isinstance(
            df, DataFrame
        ), "Error: df argument must a pyspark.sql.dataframe.DataFrame type"
        self.transformer.set_data_frame(df)

    def _execute_analyzer(self, columns):
        # First
        self.analyzer.unpersist_df()
        del self.analyzer
        # Instance of analyzer class:
        self.analyzer = DataFrameAnalyzer(self.transformer.df, self._path_file,
                                          self._pu)
        # Sampling if it is specified (this method is dependent to the pu argument)
        self.analyzer.analyze_sample()
        # Analyze column specified by user:
        self.analyzer.column_analyze(column_list=columns,
                                     plots=True,
                                     values_bar=False,
                                     print_type=True,
                                     num_bars=50)

    def trim_col(self, columns):
        """This methods cut left and right extra spaces in column strings provided by user.
        :param columns   list of column names of dataFrame.
                        If a string "*" is provided, the method
                        will do the trimming operation in whole dataFrame.

        :return transformer object
        """
        # Calling trimCol
        self.transformer.trim_col(columns=columns)
        # Execute analyzer:
        self._execute_analyzer(columns)

    def drop_col(self, columns):
        """This method eliminate the list of columns provided by user.
        :param columns      list of columns names or a string (a column name).

        :return transformer object
        """

        # Calling dropCol
        self.transformer.drop_col(columns=columns)
        # Execute analyzer:
        self._execute_analyzer(columns)

    def lower_case(self, columns):
        """This function set all strings in columns of dataframe specified to lowercase.
        Columns argument must be a string or a list of string. In order to apply this
        function to all dataframe, columns must be equal to '*'"""

        # Calling lowerCase
        self.transformer.lower_case(columns=columns)
        # Execute analyzer:
        self._execute_analyzer(columns)

    def upper_case(self, columns):
        """This function set all strings in columns of dataframe specified to uppercase.
        Columns argument must be a string or a list of string. In order to apply this function to all
        dataframe, columns must be equal to '*'"""

        # Calling lowerCase
        self.transformer.upper_case(columns=columns)
        # Execute analyzer:
        self._execute_analyzer(columns)

    def keep_col(self, columns):
        """This method keep only columns specified by user with columns argument in DataFrame.
        :param columns list of columns or a string (column name).

        :return transformer object
        """

        # Calling keepCol
        self.transformer.keep_col(columns=columns)
        # Execute analyzer:
        self._execute_analyzer(columns)