Example #1
    # Requires: from pyspark import SparkContext
    #           from pyspark.sql import SparkSession
    def init_spark_with_s3(self, s3):
        """Initialise a SparkContext/SparkSession configured for S3 access.

        :param s3: dict with the S3 credentials under the keys "ID" and "key"
        :return: (success, message) tuple; message holds the last error text
        """
        success = False
        message = ""

        # Retry up to 10 times, since context creation can fail transiently.
        for _ in range(10):
            try:
                # getOrCreate() is a classmethod; no need to instantiate first.
                self.spark_context = SparkContext.getOrCreate()

                self.hadoop_conf = self.spark_context._jsc.hadoopConfiguration()
                self.hadoop_conf.set(
                    "fs.s3n.impl",
                    "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
                self.hadoop_conf.set("fs.s3n.awsAccessKeyId", s3["ID"])
                self.hadoop_conf.set("fs.s3n.awsSecretAccessKey", s3["key"])

                self.spark_session = SparkSession(self.spark_context)
                success = True
                break  # was `continue`, which kept retrying after success
            except Exception as e:
                logger.error(e)
                message = str(e)  # keep the text, not the exception object

        return success, message
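
A minimal usage sketch, assuming the methods live on a reader class (SparkReader is a hypothetical name) and that the credentials dict uses the "ID" and "key" fields this method actually reads:

    # Hedged sketch: SparkReader is a hypothetical class name; the dict
    # fields match what init_spark_with_s3 looks up.
    reader = SparkReader()
    success, message = reader.init_spark_with_s3({
        "ID": "my-access-key-id",    # AWS access key id
        "key": "my-secret-key",      # AWS secret access key
    })
    if not success:
        print("Spark init failed:", message)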
Example #2
    def read(self, address="", file_format="csv", s3=None):
        """Read a file from S3 into a Spark dataframe.

        :param address: path of the file inside the bucket
        :param file_format: one of "csv", "excel" or "parquet"
        :param s3: dict with the S3 credentials and bucket name
        :return: the loaded dataframe, or a {"success", "message"} dict on error
        """
        try:
            s3 = s3 or {}  # avoid the mutable-default-argument pitfall
            status, message = self.init_spark_with_s3(s3)
            if status is False:
                return {"success": status, "message": message}

            self.s3 = s3
            self.address = address

            if file_format == "csv":
                self.dataframe = self.read_csv(address)
            elif file_format == "excel":
                self.dataframe = self.read_excel(address)
            elif file_format == "parquet":
                self.dataframe = self.read_parquet(address)
            else:
                # Fail explicitly instead of returning a stale dataframe.
                message = ("File format " + file_format +
                           " is currently not supported."
                           " Please create a feature request on Github")
                print(message)
                return {"success": False, "message": message}

            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}
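
read() dispatches on file_format once the S3 setup from Example #1 succeeds. A sketch of a call, with a hypothetical path and bucket name; "bucket" is included because read_parquet in Example #3 reads self.s3["bucket"]:

    # Hedged sketch: reader is the same hypothetical SparkReader instance
    # as above; path and bucket names are illustrative only.
    df = reader.read(
        address="data/events.parquet",
        file_format="parquet",
        s3={"ID": "my-access-key-id", "key": "my-secret-key",
            "bucket": "my-bucket"},
    )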
Example #3
    def read_parquet(self, path):
        try:
            # Build the full s3n:// URL from the configured bucket and path.
            self.dataframe = self.spark_session.read.load(
                's3n://' + self.s3["bucket"] + '/' + path)
            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}
Example #4
    # Requires: from dateutil import parser
    def date_formatting(self, x):
        """
        The dateutil library is used to convert the different date formats into a standard format.
        :param x: row-wise date values
        :return: date in a standard format (falls back to the raw value on parse failure)
        """
        try:
            return str(parser.parse(x))
        except Exception as e:
            logger.error(e)
            return str(x)
    def read_parquet(self, path):
        """Load a parquet file from the given path.

        :param path: full path or URL of the parquet file
        :return: the loaded dataframe, or a {"success", "message"} dict on error
        """
        try:
            self.dataframe = self.spark_session.read.load(path)
            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}
    def read_excel(self, path):
        """Load an "excel" file from the given path.

        :param path: full path or URL of the file
        :return: the loaded dataframe, or a {"success", "message"} dict on error
        """
        try:
            # NOTE: Spark has no built-in Excel reader; this parses the file
            # as CSV. True .xlsx support would need an external package such
            # as spark-excel (com.crealytics.spark.excel).
            self.dataframe = self.spark_session.read.csv(
                path, inferSchema=True, header=True)
            return self.dataframe
        except Exception as e:
            logger.error(e)
            return {"success": False, "message": str(e)}