Example No. 1
    def remove_columns_containing_all_nan_values(df, threshold=80):
        """
        Receives a dataframe and a threshold, and removes every column whose
        percentage of NaN/null-like values is >= threshold.
        :param df: original dataframe containing the data
        :param threshold: NaN threshold as a percentage (0-100)
        :return: dataframe with the offending columns removed
        """
        try:
            # Count, per column, the rows that are null or contain a
            # null-like placeholder string ("NULL", "None", ...).
            null_counts = df.select(
                [funct.count(
                    funct.when(funct.col(col).isNull() |
                               funct.col(col).contains("NULL") |
                               funct.col(col).contains("null") |
                               funct.col(col).contains("Null") |
                               funct.col(col).contains("None") |
                               funct.col(col).contains("NONE") |
                               funct.col(col).contains("none"), col)).alias(col)
                 for col in df.columns]).collect()[0].asDict()
            size_df = df.count()
            to_drop = [k for k, v in null_counts.items() if ((v / size_df) * 100) >= threshold]
            logger.warning("columns to drop: %s", to_drop)
            df = df.drop(*to_drop)
            return df
        except Exception as e:
            logger.error(e)
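A minimal usage sketch, assuming `funct` is the usual alias for `pyspark.sql.functions`, a configured `logger`, and an active SparkSession (in the test examples below the function is invoked via `Duplication()`):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    # Column "b" is entirely null, so it exceeds the default 80% threshold.
    df = spark.createDataFrame([(1, None), (2, None), (3, None)],
                               schema="a INT, b STRING")
    cleaned = remove_columns_containing_all_nan_values(df)
    print(cleaned.columns)  # ['a']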
Example No. 2
    def read(self, address="", local="yes", file_format="csv", s3=None):
        """
        Reads a file from either the local filesystem or s3.

        :param address: path or location of the file
        :param local: "yes" to read from the local filesystem
        :param file_format: file format, e.g. "csv"
        :param s3: dict with s3 connection details (empty/None when unused)
        :return: the resulting dataframe, or a failure dict
        """
        s3 = s3 or {}  # avoid a mutable default argument
        try:
            if local == "yes":
                # Read the file saved locally.
                rf = ReadFileFromLocal()
                self.dataframe = rf.read(address, file_format)

            elif s3 != {}:
                # Read the data from s3.
                self.dataframe = ReadFileFromS3(address, file_format, s3)

            else:
                # Not sure where the file is saved.
                message = "Please make sure the file is saved on either your local system or s3."
                logger.debug(message)
                self.dataframe = {"success": False, "message": message}

            return self.dataframe
        except Exception as e:
            logger.error(e)
Example No. 3
    def removing_stop_words(self, x, base_url):
        """
        Given a url value and a base_url, returns the cleaned url.

        :param x: row value on which the cleaning needs to be performed
        :param base_url: the base url, if already known
        :return: cleaned url
        """
        try:
            # If the base_url param is empty, figure it out using urllib.
            if base_url == '':
                base_url = urlparse(x)
                base_url = base_url.netloc if base_url.scheme != '' else base_url.path.split("/")[0]
            x = x.replace("https://", "").replace("http://", "").replace(base_url, "")

            # Keep word characters, colons and apostrophes; drop other special characters.
            tokens = re.findall(r"[\w:']+", x)

            # Remove duplicate words from the url (dict preserves insertion order).
            tokens = list(dict.fromkeys(tokens))

            # Remove stop words from the url.
            elem = [word for word in tokens if word not in self.stop_words]

            # Prepend the base_url back onto the url.
            elem.insert(0, base_url)

            return '/'.join(elem)
        except Exception as e:
            logger.error(e)
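A quick standalone walk-through of the same cleaning logic, with a hypothetical stop-word set:

    import re
    from urllib.parse import urlparse

    stop_words = {"the", "a", "of"}                    # illustrative only
    x = "https://example.com/the/price/of/shoes"
    base_url = urlparse(x).netloc                      # "example.com"
    x = x.replace("https://", "").replace(base_url, "")
    tokens = list(dict.fromkeys(re.findall(r"[\w:']+", x)))
    cleaned = '/'.join([base_url] + [w for w in tokens if w not in stop_words])
    print(cleaned)  # example.com/price/shoes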
Example No. 4
    def date_formatting(self, x):
        """
        The dateutil library is used to convert the different date formats
        into one standard format.
        :param x: row-wise date value
        :return: the date in standard format, or the original value on failure
        """
        try:
            return str(parser.parse(x))
        except Exception as e:
            logger.error(e)
            return str(x)
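For reference, a small standalone demonstration of how `dateutil.parser.parse` normalizes mixed date formats (and one caveat worth knowing):

    from dateutil import parser

    for raw in ["2021-03-05", "05/03/2021", "March 5, 2021"]:
        print(str(parser.parse(raw)))
    # 2021-03-05 00:00:00
    # 2021-05-03 00:00:00   (dateutil defaults to month-first for xx/xx/xxxx)
    # 2021-03-05 00:00:00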
Example No. 5
    def on_message(self, message):
        for msg in message:
            logger.debug("TOPIC: %s - PARTITION: %s - KEY: %s" % (msg.topic, msg.partition, msg.key))
            data = json.loads(msg.value)
            logger.debug("POLL _ID: %s" % data.get("_id"))
            try:
                self.callback(data.get("data"))
            except Exception as e:
                logger.error(traceback.format_exc())
                logger.error(e.args)
                raise e
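A minimal sketch of how this handler might be fed, assuming the kafka-python client and a hypothetical `handler` object exposing `on_message` and a `callback`:

    from kafka import KafkaConsumer

    consumer = KafkaConsumer("my-topic",                 # hypothetical topic name
                             bootstrap_servers="localhost:9092")
    while True:
        # poll() returns {TopicPartition: [records]}; each record has
        # .topic, .partition, .key and .value, as used in on_message above.
        for tp, records in consumer.poll(timeout_ms=1000).items():
            handler.on_message(records)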
Example No. 6
    def run(self, df):

        try:
            time_variables = self.find_time_variables(df)
            logger.debug(time_variables)
            for v in time_variables:
                df = self.string_to_timestamp(df, v)
                self.update_metadata(v)
            return df

        except Exception as e:
            logger.error(e)
Example No. 7
    def read_parquet(self, path):
        """
        Reads a parquet file into a dataframe (parquet is Spark's default
        load format).
        :param path: path to the parquet file
        :return: the dataframe, or a failure dict on error
        """
        try:
            self.dataframe = self.spark_session.read.load(path)
            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}
Example No. 8
    def fetch_columns_containing_url(df):
        """
        Automatically fetches the names of the columns that contain urls.
        :param df: original dataframe
        :return: list of columns containing urls
        """
        try:
            # Note: only the first row is inspected, so the first row must be
            # representative of the column's contents.
            col_dict = df.select(
                [funct.col(col).rlike(r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))').alias(col)
                 for col in df.columns]).collect()[0].asDict()
            col_containing_url = [k for k, v in col_dict.items() if v is True]
            return col_containing_url
        except Exception as e:
            logger.error(e)
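The first-row check above can miss urls that only appear further down. A hedged variant of the same idea that aggregates over all rows, using `funct.max` on the per-row `rlike` booleans:

    url_regex = r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))'
    # rlike yields a boolean per row; cast to int and take the max, so a
    # column scores 1 if any row contains a url.
    col_dict = df.select(
        [funct.max(funct.col(c).rlike(url_regex).cast("int")).alias(c)
         for c in df.columns]).collect()[0].asDict()
    cols_with_urls = [k for k, v in col_dict.items() if v == 1]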
Example No. 9
    def run(self, df):

        try:
            time_variables = self.find_time_variables(df)
            for v in time_variables:
                df = self.make_new_variables(df, v)
                self.update_metadata(v)
            return df

        except Exception as e:
            logger.error(e)
            return e
Example No. 10
    def prep(self,
             dataframe=None,
             s3=None,
             local="no",
             file_name="",
             file_format="csv",
             output_address="",
             output_format=""):
        """
        Prepares the data: accepts a dataframe directly, or reads one from
        the local filesystem or s3, then runs the preprocessing on it.

        :param dataframe: a pandas or spark dataframe, or None to read from a file
        :param s3: dict with s3 connection details
        :param local: "yes" to read from the local filesystem
        :param file_name: name of the input file
        :param file_format: format of the input file, e.g. "csv"
        :param output_address: where to write the output
        :param output_format: format of the output
        :return: the preprocessed dataframe
        """

        self.local = local
        self.s3 = s3 or {}  # avoid a mutable default argument
        self.file_format = file_format
        self.file_name = file_name
        self.output_address = output_address
        self.output_format = output_format
        try:
            # Convert the dataframe into a spark dataframe if the dataframe
            # provided is not None. Note: "dataframe != None" would raise on
            # a pandas dataframe (ambiguous truth value), so use "is not None".
            if dataframe is not None:
                if isinstance(dataframe, pd.DataFrame):
                    # Convert the pandas dataframe into a spark dataframe.
                    self.dataframe = self.spark_session.createDataFrame(dataframe)
                else:
                    self.dataframe = dataframe
            else:
                # Read the file from local or s3 when no dataframe is passed.
                self.dataframe = self.read_as_dataframe()

            # Time to prepare the recipe.
            self.dataframe_output = self.preprocess()

            return self.dataframe_output
        except Exception as e:
            logger.error(e)
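A usage sketch, assuming the method lives on a preprocessing class (called `Prep` here purely for illustration) that owns a `spark_session`:

    import pandas as pd

    prep = Prep()  # hypothetical wrapper class holding a SparkSession
    pdf = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
    # A pandas dataframe is converted to Spark internally before preprocessing.
    result = prep.prep(dataframe=pdf, output_format="csv")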
Example No. 11
def remove_cols_containing_nan():
    try:
        df = sql.read.csv("./run/rem_test.csv", inferSchema=True, header=True)

        return_df = Duplication().remove_columns_containing_all_nan_values(df)
        return_df.toPandas().to_csv('./run/rem_test_result.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
Example No. 12
def cleaning_test():
    try:
        df = sql.read.csv("./run/column_rem.csv",
                          inferSchema=True,
                          header=True)

        return_df = Duplication().remove_columns_contains_same_value(df)
        return_df.toPandas().to_csv('./run/rem_test.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
Example No. 13
    def read_excel(self, path):
        """
        Reads the file at path into a dataframe. Note: despite the name, this
        uses Spark's csv reader (Spark has no built-in excel reader).
        :param path: path to the file
        :return: the dataframe, or a failure dict on error
        """
        try:
            self.dataframe = self.spark_session.read.csv(path,
                                                         inferSchema=True,
                                                         header=True)
            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}
Example No. 14
def date_cleaning():
    try:
        df = sql.read.csv("./run/testing_dates.csv",
                          inferSchema=True,
                          header=True)
        print(df.columns)
        return_df = DatetimeFormatting().date_cleaning(df, ['dates'])
        return_df.toPandas().to_csv('./run/date_test_res.csv')
        df.show()
        print("#####################")
        print("resulted_df")
        return_df.show()
    except Exception as e:
        logger.error(e)
Example No. 15
    def read_as_dataframe(self):
        """
        Reads the configured file (local or s3) into self.dataframe.
        :return: the dataframe that was read
        """
        try:
            read = ReadFile()
            self.dataframe = read.read(address=self.file_address,
                                       local=self.local,
                                       file_format=self.file_format,
                                       s3=self.s3)
            # Return the dataframe itself; the original returned True, which
            # made callers such as prep() overwrite self.dataframe with True.
            return self.dataframe
        except Exception as e:
            logger.error(e)
Example No. 16
    def run(self, df):

        try:
            numeric_columns = self.find_numeric_variables_saved_as_string(df)
            df = self.update_variable_types(df, numeric_columns)
            # self.update_metadata(v)  # disabled; 'v' is not defined here
            return df

        except Exception as e:
            logger.error(e)
Example No. 17
    def fetch_columns_containing_datetime(df):
        """
        Automatically detects the columns which contain date values.
        :param df: original dataframe
        :return: list of column names containing date values
        """
        try:
            # Note: as with the url detector, only the first row is inspected.
            col_dict = df.select([
                funct.col(col).rlike(r'(\d+[/-]\d+[/-]\d{2,4})').alias(col)
                for col in df.columns
            ]).collect()[0].asDict()
            cols_containing_datetime = [k for k, v in col_dict.items() if v is True]
            return cols_containing_datetime
        except Exception as e:
            logger.error(e)
Example No. 18
    def remove_columns_contains_same_value(df):
        """
        Removes columns which contain only one distinct value.
        :param df: original dataframe containing the data
        :return: dataframe with the constant columns removed
        """
        try:
            # countDistinct per column; a count of 1 means the column is constant.
            col_counts = df.select(
                [funct.countDistinct(funct.col(col)).alias(col)
                 for col in df.columns]).collect()[0].asDict()
            to_drop = [k for k, v in col_counts.items() if v == 1]

            df = df.drop(*to_drop)

            return df
        except Exception as e:
            logger.error(e)
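A minimal sketch, assuming `funct` aliases `pyspark.sql.functions` and an active SparkSession named `spark`:

    df = spark.createDataFrame([(1, "x"), (2, "x"), (3, "x")], ["a", "b"])
    cleaned = remove_columns_contains_same_value(df)
    print(cleaned.columns)  # ['a'] - column "b" held a single distinct value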
Example No. 19
    def send(self, msg, key=None, partition=None, timestamp_ms=None):
        # Wrap the payload with a freshly allocated uuid so the message can
        # be tracked end-to-end (the consumer logs "POLL _ID" on receipt).
        _id = allocate_uuid()
        msg = {"_id": _id, "data": msg}
        msg = self.__format_msg(msg)
        self.connector()
        try:
            self.client.send(MQ_KAFKA_TOPIC,
                             value=msg,
                             key=key,
                             partition=partition,
                             timestamp_ms=timestamp_ms)
            logger.debug("SEND _ID: %s" % _id)
            return _id
        except Exception as e:
            logger.error(traceback.format_exc())
            logger.error(e.args)
            raise e
Example No. 20
    def date_cleaning(self, df, column_name=[]):
        """
        Converts all the columns containing dates into a standard date format.
        In a loop, each listed column is traversed and the udf_date_formatting
        function is applied to its values.

        :param df: original dataframe
        :param column_name: list of column names containing dates
        :return: a new dataframe with _new columns holding the updated date values
        """
        try:
            for i in column_name:
                df = df.withColumn(
                    i + '_new',
                    self.udf_date_formatting()(funct.col(i).cast("String")))
            return df
        except Exception as e:
            logger.error(e)
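udf_date_formatting is not shown in these excerpts; presumably it wraps date_formatting from Example No. 4 in a Spark UDF, along these lines (a sketch, not the project's actual code):

    import pyspark.sql.functions as funct
    from pyspark.sql.types import StringType

    def udf_date_formatting(self):
        # Wrap the row-wise date_formatting method as a Spark UDF
        # returning strings, matching the call pattern above.
        return funct.udf(self.date_formatting, StringType())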
Example No. 21
    def remove_duplicate_urls(self, df, column_name, base_url=''):
        """
        Receives the original dataframe along with the columns containing
        urls, and cleans those columns to remove duplication.

        :param df: dataframe containing the data which needs to be cleaned
        :param column_name: list of columns containing urls
        :param base_url: optional base url
        :return: dataframe with _new columns holding the cleaned urls
        """
        try:
            for i in column_name:
                df = df.withColumn(i + '_new',
                                   self.udf_remove_stop_words(base_url)(
                                       funct.trim(funct.lower(funct.col(i).cast("string")))))

            return df
        except Exception as e:
            logger.error(e)
Example No. 22
    def run(self, df):

        try:
            self.df = df
            variables = self.get_variables_types()

            df = self.train_model_for_categorical_variables()
            df = self.impute_categorical_variables()
            del_variables = self.delete_extra_categorical_variables()

            # 'v' was undefined in the original; 'variables' looks like the
            # intended argument here.
            self.update_metadata(variables)

            # df = self.train_model_for_numerical_variables()
            # df = self.impute_numerical_variables()
            # del_variables = self.delete_extra_numerical_variables()

            return df

        except Exception as e:
            logger.error(e)