def join_park_violation_with_centerline(df_park_violation: DataFrame, df_centerline: DataFrame) -> DataFrame:
    """
    Join the park-violation dataframe with the centerline dataframe on
    borocode, street name and house number.

    Basic steps:
        1. Even house numbers are matched against the centerline's
           right-hand-side range (R_LOW_HN .. R_HIGH_HN).
        2. Odd house numbers are matched against the centerline's
           left-hand-side range (L_LOW_HN .. L_HIGH_HN).
        3. Both joins additionally require a matching borocode and a street
           name equal to either ST_NAME or FULL_STREE.

    :param df_park_violation: violations with columns BOROCODE,
        "Street Name", "House Number", "temp" (numeric house number),
        "total_cnt", "year"
    :param df_centerline: centerline with columns BOROCODE, ST_NAME,
        FULL_STREE, L_LOW_HN, L_HIGH_HN, R_LOW_HN, R_HIGH_HN, PHYSICALID
    :return: union of the two joins, projected to total_cnt, year, PHYSICALID
    """
    df_park_violation.cache()
    df_centerline.cache()

    # "temp" holds the numeric house number; its parity decides which side
    # of the street (left or right address range) the violation joins to.
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)

    # Force materialization of the centerline cache before the two joins.
    df_centerline.count()

    # Shared join criteria: street name (either column) and borocode.
    street_match = (
        (
            (F.col("Street Name") == F.col("ST_NAME"))
            | (F.col("Street Name") == F.col("FULL_STREE"))
        )
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
    )

    # Even house numbers: right-hand side of the street.
    df_joined_even = (
        df_park_violation_even.alias("park")
        .join(
            df_centerline.alias("centerline").hint("broadcast"),
            street_match
            & (F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
            & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN")),
        )
        .select("total_cnt", "year", "PHYSICALID")
    )

    # Odd house numbers: left-hand side of the street.
    # BUG FIX: the upper bound previously compared against L_LOW_HN twice,
    # so only violations exactly at the low end of the left range ever
    # matched; it must use L_HIGH_HN (mirroring the even/right-side join).
    df_joined_odd = (
        df_park_violation_odd.alias("park")
        .join(
            df_centerline.alias("centerline").hint("broadcast"),
            street_match
            & (F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
            & (F.col("park.House Number") <= F.col("centerline.L_HIGH_HN")),
        )
        .select("total_cnt", "year", "PHYSICALID")
    )

    # Union of both sides of the street.
    return df_joined_even.unionAll(df_joined_odd)
def storeDF(self, df: DataFrame, dfName: str, persistType: str, partitions: int, partitionCols: List[str]):
    """
    Persist the input dataframe and return the dataframe to use downstream.

    Depending on the resolved save type the dataframe is repartitioned and
    written to HDFS, S3 or Hive (and the re-read dataframe returned),
    checkpointed, or persisted locally (memory/disk). For "NULL"/"NONE"
    the input dataframe is returned unchanged.

    :param df: dataframe to store
    :param dfName: logical name used for the persisted copy
    :param persistType: requested persistence type (case-insensitive)
    :param partitions: partition count used when writing to HDFS/S3/Hive
    :param partitionCols: partition columns for the external store
    :return: dataframe to continue the pipeline with
    """
    # BUG FIX: Python str has no toUpperCase(); use upper().
    persistTypeUpper = persistType.upper()

    # Explain the plan unless persistence is disabled.
    # BUG FIX: "NULL|NONE".index(x) < 0 is JS indexOf semantics — Python
    # str.index() raises ValueError when not found and never returns a
    # negative value; use a membership test instead.
    if self.__explainDF and persistTypeUpper not in ("NULL", "NONE"):
        self.log.info("Execution pland for building the DF '%s'" % (dfName))
        df.explain()
        self.log.info("\n\n\n")

    # --saveDFAs overrides the requested persist type except for HIVE/NULL.
    saveType = (
        self.__parms["--saveDFAs"]
        if self.__saveDF and persistTypeUpper not in ("HIVE", "NULL")
        else persistTypeUpper
    )

    # BUG FIX: the original condition (== "aws") contradicted its own log
    # message; S3 is only usable when running on AWS, so fall back to HDFS
    # when the run environment is NOT aws.
    if saveType == "S3" and self.__runEnv != "aws":
        saveType = "HDFS"
        self.log.debug(
            "Resetting the persist type to 'HDFS' as the --runEnv != 'aws'"
        )

    # Repartition only for the external targets.
    df1 = (
        self.repartitionDF(dataFrame=df, partitions=partitions)
        if saveType in ("HDFS", "HIVE", "S3")
        else df
    )

    if saveType in ("NULL", "NONE"):
        return df1
    elif saveType == "HDFS":
        # BUG FIX: pass the repartitioned df1 — the original passed df,
        # silently discarding the repartition work.
        return self.persistExternal(self.__tempHDFS, dfName, df1, partitionCols)
    elif saveType == "S3":
        return self.persistExternal(self.__tempS3, dfName, df1, partitionCols)
    elif saveType == "HIVE":
        # BUG FIX: the original compared saveType against "" here, which
        # made the Hive branch unreachable.
        return self.persist2Hive(dfName, df1, partitionCols)
    elif saveType == "CHECK_POINT":
        return df.cache().checkpoint(eager=True)
    else:
        # Local persistence (e.g. memory/disk); take(1)-style forcing is
        # handled inside persistLocal per the class docs.
        return self.persistLocal(dfName, df, persistType)
def storeDF(self, df: DataFrame, dfName: str, persistType: str, partitions: int, partitionCols: list[str]):
    """
    Persist the input dataframe and return the dataframe to use downstream.

    Depending on the resolved save type the dataframe is repartitioned and
    written to HDFS, S3 or Hive (and the re-read dataframe returned),
    checkpointed, or persisted locally (memory/disk). For "NULL"/"NONE"
    the input dataframe is returned unchanged.

    :param df: dataframe to store
    :param dfName: logical name used for the persisted copy / Hive table
    :param persistType: requested persistence type (case-insensitive)
    :param partitions: partition count used when writing to HDFS/S3/Hive
    :param partitionCols: partition columns for the external store
    :return: dataframe to continue the pipeline with
    """
    # BUG FIX: Python str has no toUpperCase(); use upper().
    persistTyp = persistType.upper()

    # Explain the plan unless persistence is disabled/deferred to Hive.
    # BUG FIX: elsewhere in this file logging goes through self.log.info(...)
    # / self.log.debug(...); calling self.log(...) directly would fail on a
    # standard logger object.
    if self.__explainDF and persistTyp not in ("NULL", "HIVE"):
        self.log.info("\n\n\n")
        self.log.info("Execution plan for building the DF '%s' is," % (dfName))
        df.explain()
        self.log.info("\n\n\n")

    # The configured save type overrides the requested persist type except
    # for HIVE/NULL.
    saveTyp = (
        self.__saveDF
        if self.__saveDF and persistTyp not in ("NULL", "HIVE")
        else persistTyp
    )

    # Repartition only for the external targets.
    df1 = (
        self.repartitionDF(dataFrame=df, partitions=partitions)
        if saveTyp in ("HDFS", "HIVE", "S3")
        else df
    )

    if saveTyp in ("NULL", "NONE"):
        return df1
    elif saveTyp == "HDFS":
        # BUG FIX: pass the repartitioned df1 — the original passed df,
        # silently discarding the repartition work (same for S3/HIVE below).
        return self.persistExternal(
            parentDirURI=self.__tempHDFS,
            fileName=dfName,
            df=df1,
            partitionCols=partitionCols,
        )
    elif saveTyp == "S3":
        return self.persistExternal(
            parentDirURI=self.__tempS3,
            fileName=dfName,
            df=df1,
            partitionCols=partitionCols,
        )
    elif saveTyp == "HIVE":
        return self.save2Hive(
            db=self.workDB,
            table=dfName,
            df=df1,
            partitionCols=partitionCols,
        )
    elif saveTyp == "CHECK_POINT":
        return df.cache().checkpoint(eager=True)
    else:
        # Local persistence (e.g. memory/disk); forcing via take(1) is
        # handled inside persistLocal per the docstring contract.
        return self.persistLocal(dfName=dfName, df=df, persistType=persistType)