示例#1
0
    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: List[str] = None,
                        overwrite: bool = True,
                        fileFormat: str = None,
                        **kwargs):
        """
        Write ``df`` to external storage, then return it re-read from there.

        The target location is ``parentDirURI`` joined with ``fileName`` by a
        single ``/``. Repeated slashes in the path part are collapsed, while a
        leading ``scheme://`` prefix (for example ``s3://``) is kept intact.

        :param parentDirURI: Directory path or URI to write under.
        :param fileName: Name appended to ``parentDirURI``; ``None`` or an
            empty string leaves the directory path with a trailing ``/``.
        :param df: DataFrame to write. ``unpersist()`` is called on it once
            the write has returned.
        :param partitionCols: Forwarded unchanged to ``write2ExtrFile``.
        :param overwrite: Forwarded unchanged to ``write2ExtrFile``.
        :param fileFormat: Format to write and read back; falls back to
            ``self.__fileFmt`` when empty or ``None``.
        :param kwargs: Forwarded to BOTH the writer and the reader.
        :return: Whatever the selected reader returns: ``readOrc`` for
            ``"orc"``, ``readCSV`` for ``"csv"``, ``readParquet`` for every
            other value.
        """
        # Always join with "/" and normalise afterwards; this gives the same
        # result whether or not parentDirURI already ends with a slash.
        rawPath = "%s/%s" % (parentDirURI, fileName or "")

        # Collapse duplicate slashes only AFTER the scheme separator. A blind
        # replace("//", "/") would turn "s3://bucket" into "s3:/bucket".
        scheme, sep, remainder = rawPath.partition("://")
        if not sep or not scheme or "/" in scheme:
            # No leading "scheme://" prefix: treat the whole string as a path.
            scheme, sep, remainder = "", "", rawPath
        while "//" in remainder:
            remainder = remainder.replace("//", "/")
        fullPath = scheme + sep + remainder

        # Capture the schema before writing so the read-back can reuse it.
        # NOTE(review): on a PySpark DataFrame ``schema`` is a property, so
        # the former ``df.schema()`` call would raise TypeError; accept both
        # the property and a method form -- confirm the actual DataFrame type.
        schma = df.schema
        if callable(schma):
            schma = schma()

        fileFormat = fileFormat or self.__fileFmt
        self.write2ExtrFile(fullPath=fullPath,
                            fileFormat=fileFormat,
                            df=df,
                            partitionCols=partitionCols,
                            overwrite=overwrite,
                            **kwargs)
        df.unpersist()

        if fileFormat == "orc":
            reader = self.readOrc
        elif fileFormat == "csv":
            reader = self.readCSV
        else:
            # "parquet" and any unrecognised format are read back as parquet.
            reader = self.readParquet
        return reader(uri=fullPath, schema=schma, **kwargs)
示例#2
0
    def persistExternal(self,
                        parentDirURI: str,
                        fileName: str,
                        df: DataFrame,
                        partitionCols: list[str] = None,
                        overwrite: bool = True):
        """
        Persist the input DataFrame to external file storage and read it back.

        The target location is ``parentDirURI`` joined with ``fileName`` by a
        single ``/``. Repeated slashes in the path part are collapsed, while a
        leading ``scheme://`` prefix (for example ``s3://``) is kept intact.
        The format used for both writing and reading is ``self.__fileFmt``.

        :param parentDirURI: Directory path or URI to write under.
        :param fileName: Name appended to ``parentDirURI``; ``None`` or an
            empty string leaves the directory path with a trailing ``/``.
        :param df: DataFrame to write. ``unpersist()`` is called on it once
            the write has returned.
        :param partitionCols: Forwarded unchanged to ``write2ExtrFile``.
        :param overwrite: Forwarded unchanged to ``write2ExtrFile``.
        :return: Whatever the selected reader returns: ``readOrc`` for
            ``'orc'``, ``readCSV`` for ``'csv'``, ``readAvro`` for ``'avro'``,
            ``readParquet`` for every other value.
        """
        # Always join with "/" and normalise afterwards; this gives the same
        # result whether or not parentDirURI already ends with a slash.
        rawPath = "%s/%s" % (parentDirURI, fileName or "")

        # The previous code wrote `"..." % (a, b).replace(...)`, which applies
        # .replace to the tuple (attribute access binds tighter than %) and
        # raises AttributeError. Normalise the formatted string instead, and
        # only after the scheme separator so "s3://" is not reduced to "s3:/".
        scheme, sep, remainder = rawPath.partition("://")
        if not sep or not scheme or "/" in scheme:
            # No leading "scheme://" prefix: treat the whole string as a path.
            scheme, sep, remainder = "", "", rawPath
        while "//" in remainder:
            remainder = remainder.replace("//", "/")
        fullPath = scheme + sep + remainder

        # Capture the schema before writing so the read-back can reuse it.
        # NOTE(review): on a PySpark DataFrame ``schema`` is a property, so
        # the former ``df.schema()`` call would raise TypeError; accept both
        # the property and a method form -- confirm the actual DataFrame type.
        schma = df.schema
        if callable(schma):
            schma = schma()

        fileFmt = self.__fileFmt
        # NOTE(review): write2ExtrFile is called as a bare name, so it must
        # resolve at module scope -- confirm it is not meant to be a method.
        write2ExtrFile(fileFmt,
                       path=fullPath,
                       fileName=fileName,
                       df=df,
                       partitionCols=partitionCols,
                       overwrite=overwrite)

        # TODO: handling specific to "s3://" parentDirURI values is yet to be
        # implemented; such URIs currently take the same path as any other.
        df.unpersist()

        if fileFmt == 'orc':
            return self.readOrc(fullPath=fullPath, schma=schma)
        if fileFmt == 'csv':
            return self.readCSV(fullPath=fullPath, schma=schma)
        if fileFmt == 'avro':
            return self.readAvro(fullPath=fullPath, schma=schma)
        # 'parquet' and any unrecognised format are read back as parquet.
        return self.readParquet(fullPath=fullPath, schma=schma)