from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def join_park_violation_with_centerline(df_park_violation: DataFrame, df_centerline: DataFrame) -> DataFrame:
    """
    Joins the park_violation dataframe with the centerline dataframe on borocode,
    street name and house number.

    Basic steps:
    1. Odd house numbers are joined against L_LOW_HN & L_HIGH_HN of the centerline data.
    2. Even house numbers are joined against R_LOW_HN & R_HIGH_HN of the centerline data.
    3. Borocode and street name are additional join criteria in both cases.

    :param df_park_violation:
    :param df_centerline:
    :return:
    """
    # df_park_violation = df_park_violation.repartition("BOROCODE", "Street Name", "House Number")
    df_park_violation.cache()
    df_centerline.cache()

    # Split violations by house-number parity; "temp" holds the numeric house number.
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)

    df_centerline.count()  # eager action, presumably to materialize the cached centerline dataframe

    # Even house numbers fall in the right-side range (R_LOW_HN..R_HIGH_HN).
    df_joined_1 = (df_park_violation_even.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) | (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & (F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
        & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN")),
    ).select("total_cnt", "year", "PHYSICALID"))

    # Odd house numbers fall in the left-side range (L_LOW_HN..L_HIGH_HN).
    df_joined_2 = (df_park_violation_odd.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("Street Name") == F.col("ST_NAME")) | (F.col("Street Name") == F.col("FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & (F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
        & (F.col("park.House Number") <= F.col("centerline.L_HIGH_HN")),
    ).select("total_cnt", "year", "PHYSICALID"))

    # Return the union of the even-side and odd-side matches.
    return df_joined_1.union(df_joined_2)
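
# Minimal usage sketch (not from the source): the file names and the upstream
# preprocessing that produces the "temp" (numeric house number), "total_cnt" and
# "year" columns are assumptions; the real pipeline derives them before this join.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("park-violations").getOrCreate()

# Hypothetical loaders standing in for the real input steps.
df_park_violation = spark.read.csv("park_violation.csv", header=True, inferSchema=True)
df_centerline = spark.read.csv("centerline.csv", header=True, inferSchema=True)

df_joined = join_park_violation_with_centerline(df_park_violation, df_centerline)
df_joined.show(5)
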
from pyspark.sql.functions import col


def embed_vector_to_not_matched_words(self, df: DataFrame, df_vector_filler: DataFrame):
    # Words that did not receive a vector in the exact-match pass.
    not_matched_df = df.where(col('word_vector').isNull()).select(
        self.sentence_col_id, 'word')

    # Assign each unmatched word its closest filler word by Levenshtein distance
    # (the helper's name keeps the original, misspelled identifier).
    df3 = self.assign_alternative_match_word_based_on_lavenshtein(
        not_matched_df, df_vector_filler)

    # Pull the vector of the matched filler word back onto the original word.
    return df3.alias('base').join(
        df_vector_filler.alias('filler'),
        df3.match == col(f'filler.{self.word_col_name}'),
        how='left').select(
            self.sentence_col_id,
            col('base.word').alias('word'),
            col('filler.word_vector').alias('word_vector'))
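
# The helper called above is not shown in this snippet. Below is a hedged sketch
# (an assumption, not the source implementation) of how it might look inside the
# same class: cross-join unmatched words with the filler vocabulary, score each
# pair with Levenshtein distance, and keep the best filler word per input word.
# Note the cross join is O(n * vocabulary) and only practical for small fillers.
from pyspark.sql import Window
from pyspark.sql.functions import levenshtein, row_number


def assign_alternative_match_word_based_on_lavenshtein(
        self, not_matched_df: DataFrame, df_vector_filler: DataFrame) -> DataFrame:
    # Score every (unmatched word, filler word) pair by edit distance.
    scored = (not_matched_df.alias('nm')
              .crossJoin(df_vector_filler.select(self.word_col_name).alias('f'))
              .withColumn('dist',
                          levenshtein(col('nm.word'), col(f'f.{self.word_col_name}'))))

    # Keep the lowest-distance filler word for each (sentence, word) pair.
    w = Window.partitionBy(col(self.sentence_col_id), col('nm.word')).orderBy('dist')
    return (scored.withColumn('rn', row_number().over(w))
            .where(col('rn') == 1)
            .select(col(self.sentence_col_id),
                    col('nm.word').alias('word'),
                    col(f'f.{self.word_col_name}').alias('match')))
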
from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession


def tableMerge(spark: SparkSession, df: DataFrame, settings: dict) -> None:
    """merges data into an existing table - avoids duplicate rows"""
    # print(f"\033[1;33mtableMerge - {settings.get('path')}\033[0m")  # debug terminal
    if settings.get('key') is None:
        raise ValueError('Provide a key in your settings dict to merge tables')

    # Note: if the path is not already a Delta table, the function is a no-op.
    if DeltaTable.isDeltaTable(spark, settings.get('path')):
        spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")
        spark.sql("SET spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = false")
        debugSession(spark)  # helper defined elsewhere in this project

        dt = DeltaTable.forPath(spark, settings.get('path'))
        # Insert only rows whose key is not already present in the target table.
        (dt.alias("t")
            .merge(
                df.alias("s"),
                f"t.{settings.get('key')} = s.{settings.get('key')}")
            .whenNotMatchedInsertAll()
            .execute())
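
# Minimal usage sketch (an assumption, not from the source): the Delta path and
# the "order_id"/"status" columns are hypothetical. Rows whose key already
# exists in the target table are skipped, so re-running the merge is idempotent.
spark = SparkSession.builder.getOrCreate()
df_new = spark.createDataFrame(
    [(1, "pending"), (2, "shipped")], ["order_id", "status"])
tableMerge(spark, df_new, {"path": "/tmp/delta/orders", "key": "order_id"})
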