Example #1
0
    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: List[str]):
        '''
        Store the input dataframe, read the persisted dataframe back and
        return the new one.

        :param df:            dataframe to persist
        :param dfName:        logical name used for logging and as the
                              target file/table name
        :param persistType:   requested persistence ("NULL"/"NONE", "HDFS",
                              "S3", "HIVE", "CHECK_POINT", or a local
                              memory/disk level handled by persistLocal)
        :param partitions:    partition count used when repartitioning for
                              external storage
        :param partitionCols: partition columns for external/Hive storage
        :return: the dataframe read back from the chosen persistence layer
                 (or the input dataframe itself for NULL/NONE)

        If Memory/Disk persistence is requested, persistLocal runs take(1)
        on the dataframe to force persistence.
        '''
        # Normalize once. Python strings use .upper(); the original
        # .toUpperCase() is a Java-ism and raises AttributeError.
        persistTyp = persistType.upper()

        # Only print the plan when explain is on and the DF will actually be
        # built here. The original Java-style `"NULL|NONE".index(x) < 0`
        # never returns a negative value in Python (str.index raises
        # ValueError); a membership test expresses the intent.
        if self.__explainDF and persistTyp not in ("NULL", "NONE"):
            self.log.info("Execution plan for building the DF '%s'" %
                          (dfName))
            df.explain()
            self.log.info("\n\n\n")

        # The global --saveDFAs override applies unless the caller asked for
        # HIVE or NULL persistence explicitly.
        saveType = self.__parms["--saveDFAs"] \
            if self.__saveDF and persistTyp not in ("HIVE", "NULL") \
            else persistTyp

        # S3 is only usable when running on AWS; fall back to HDFS otherwise.
        # The original tested == "aws", contradicting its own log message.
        if saveType == "S3" and self.__runEnv != "aws":
            saveType = "HDFS"
            self.log.debug(
                "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
            )

        # Repartition only for external targets that benefit from it.
        df1 = df if saveType not in ("HDFS", "HIVE", "S3") \
                 else self.repartitionDF(dataFrame=df, partitions=partitions)

        if saveType in ("NULL", "NONE"):
            return df1
        elif saveType == "HDFS":
            # Persist the repartitioned frame (df1), not the original df —
            # otherwise the repartition above is dead work.
            return self.persistExternal(self.__tempHDFS, dfName, df1,
                                        partitionCols)
        elif saveType == "S3":
            return self.persistExternal(self.__tempS3, dfName, df1,
                                        partitionCols)
        elif saveType == "HIVE":
            # Original compared against "" (an obvious typo): this branch
            # calls persist2Hive, so it must match "HIVE".
            return self.persist2Hive(dfName, df1, partitionCols)
        elif saveType == "CHECK_POINT":
            return df.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName, df, persistType)
Example #2
0
    def storeDF(self, df: DataFrame, dfName: str, persistType: str,
                partitions: int, partitionCols: list[str]):
        '''
        Store the input dataframe, read the persisted dataframe back and
        return the new one.

        :param df:            dataframe to persist
        :param dfName:        logical name used for logging and as the
                              target file/table name
        :param persistType:   requested persistence ("NULL"/"NONE", "HDFS",
                              "S3", "HIVE", "CHECK_POINT", or a local
                              memory/disk level handled by persistLocal)
        :param partitions:    partition count used when repartitioning for
                              external storage
        :param partitionCols: partition columns for external/Hive storage
        :return: the dataframe read back from the chosen persistence layer
                 (or the input dataframe itself for NULL/NONE)

        If Memory/Disk persistence is requested, persistLocal runs take(1)
        on the dataframe to force persistence.
        '''
        # Python strings use .upper(); the original .toUpperCase() is a
        # Java-ism and raises AttributeError at runtime.
        persistTyp = persistType.upper()
        if self.__explainDF and \
            (persistTyp not in ['NULL', 'HIVE']):
            # NOTE(review): self.log is invoked directly here, while the
            # sibling variant uses self.log.info — confirm self.log is
            # callable in this class.
            self.log("\n\n\n")
            self.log("Execution plan for building the DF '%s' is," % (dfName))
            df.explain()
            self.log("\n\n\n")

        # A configured save-type override applies unless the caller asked
        # for HIVE or NULL persistence explicitly.
        saveTyp = self.__saveDF if self.__saveDF and persistTyp not in ['NULL', 'HIVE'] \
                  else persistTyp

        # Repartition only for external targets that benefit from it.
        df1 = df if saveTyp not in ["HDFS", "HIVE", "S3"] \
                 else self.repartitionDF(dataFrame=df, partitions=partitions)

        if saveTyp in ["NULL", "NONE"]:
            return df1
        elif saveTyp == "HDFS":
            # Persist the repartitioned frame (df1), not the original df —
            # otherwise the repartition above is dead work.
            return self.persistExternal(parentDirURI=self.__tempHDFS,
                                        fileName=dfName,
                                        df=df1,
                                        partitionCols=partitionCols)
        elif saveTyp == "S3":
            return self.persistExternal(parentDirURI=self.__tempS3,
                                        fileName=dfName,
                                        df=df1,
                                        partitionCols=partitionCols)
        elif saveTyp == "HIVE":
            return self.save2Hive(db=self.workDB,
                                  table=dfName,
                                  df=df1,
                                  partitionCols=partitionCols)
        elif saveTyp == "CHECK_POINT":
            return df.cache().checkpoint(eager=True)
        else:
            return self.persistLocal(dfName=dfName,
                                     df=df,
                                     persistType=persistType)