Example #1
from pyspark.sql import DataFrame
import pyspark.sql.functions as F


def join_park_violation_with_centerline(df_park_violation: DataFrame,
                                        df_centerline: DataFrame) -> DataFrame:
    """
    Join the park_violation dataframe with the centerline dataframe on borocode,
    street name and house number.

    Basic steps:
    1. Join odd house numbers against L_LOW_HN & L_HIGH_HN of the centerline data.
    2. Join even house numbers against R_LOW_HN & R_HIGH_HN of the centerline data.
    3. Additionally match on borocode and street name.

    :param df_park_violation: parking-violation records
    :param df_centerline: street centerline records
    :return: union of the even and odd house-number joins
    """
    # df_park_violation = df_park_violation.repartition("BOROCODE", "Street Name", "House Number")
    # Cache both inputs: each dataframe is read twice (odd and even branches).
    df_park_violation.cache()
    df_centerline.cache()

    # Split violations by house-number parity ("temp" holds the house number as an integer).
    df_park_violation_odd = df_park_violation.filter(F.col("temp") % 2 != 0)
    df_park_violation_even = df_park_violation.filter(F.col("temp") % 2 == 0)
    # Trigger the centerline cache (count() forces evaluation) before the joins below.
    df_centerline.count()

    # Even house numbers match the right side of the centerline (R_LOW_HN..R_HIGH_HN).
    df_joined_1 = (df_park_violation_even.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("park.Street Name") == F.col("centerline.ST_NAME")) |
         (F.col("park.Street Name") == F.col("centerline.FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.R_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.R_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))

    # Odd house numbers match the left side of the centerline (L_LOW_HN..L_HIGH_HN).
    df_joined_2 = (df_park_violation_odd.alias("park").join(
        df_centerline.alias("centerline").hint("broadcast"),
        ((F.col("park.Street Name") == F.col("centerline.ST_NAME")) |
         (F.col("park.Street Name") == F.col("centerline.FULL_STREE")))
        & (F.col("park.BOROCODE") == F.col("centerline.BOROCODE"))
        & ((F.col("park.House Number") >= F.col("centerline.L_LOW_HN"))
           & (F.col("park.House Number") <= F.col("centerline.L_HIGH_HN"))),
    ).select("total_cnt", "year", "PHYSICALID"))
    """returing union of 2 dataframes"""
    return df_joined_1.unionAll(df_joined_2)
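
A minimal usage sketch; the file paths are placeholders and the inputs are assumed to already carry the derived columns used above (temp, total_cnt, year), none of which is part of the original example:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("park-violation-join").getOrCreate()

# Hypothetical, pre-prepared inputs for illustration only.
df_violations = spark.read.parquet("park_violations_prepared.parquet")
df_centerline = spark.read.parquet("centerline_prepared.parquet")

df_counts = join_park_violation_with_centerline(df_violations, df_centerline)
df_counts.show(10)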
Example #2

from pyspark.sql import DataFrame
from pyspark.sql.functions import col


# Method of an embedding/matching class; shown here as a standalone snippet.
def embed_vector_to_not_matched_words(self, df: DataFrame,
                                      df_vector_filler: DataFrame) -> DataFrame:
    """Fill missing word vectors using the closest Levenshtein match from df_vector_filler."""
    # Rows whose first-pass embedding lookup produced no vector.
    not_matched_df = df.where(col('word_vector').isNull()).select(
        self.sentence_col_id, 'word')

    # Assign each unmatched word its closest alternative by Levenshtein distance.
    df3 = self.assign_alternative_match_word_based_on_lavenshtein(
        not_matched_df, df_vector_filler)

    # Left join keeps words with no alternative either; their word_vector stays null.
    return df3.alias('base').join(
        df_vector_filler.alias('filler'),
        df3.match == col(f'filler.{self.word_col_name}'),
        how='left').select(
            self.sentence_col_id,
            col('base.word').alias('word'),
            col('filler.word_vector').alias('word_vector'))
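
The Levenshtein helper called above is not shown in this example. As context, here is a minimal sketch of how such a closest-match assignment can be built with the built-in F.levenshtein; the function name and column handling are assumptions, not the original helper:

import pyspark.sql.functions as F
from pyspark.sql import DataFrame, Window


def closest_match_sketch(not_matched_df: DataFrame,
                         df_vector_filler: DataFrame,
                         word_col: str = 'word') -> DataFrame:
    """Attach the filler word with the smallest Levenshtein distance as 'match'."""
    candidates = df_vector_filler.select(F.col(word_col).alias('candidate')).distinct()
    scored = (not_matched_df
              # Unique row id so duplicate words in different sentences survive.
              .withColumn('_rid', F.monotonically_increasing_id())
              .crossJoin(candidates)
              .withColumn('dist', F.levenshtein(F.col(word_col), F.col('candidate'))))
    # Keep, per input row, the candidate with the smallest edit distance.
    w = Window.partitionBy('_rid').orderBy('dist')
    return (scored
            .withColumn('rn', F.row_number().over(w))
            .where(F.col('rn') == 1)
            .withColumnRenamed('candidate', 'match')
            .drop('rn', 'dist', '_rid'))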
Example #3

from delta.tables import DeltaTable
from pyspark.sql import DataFrame, SparkSession


def tableMerge(spark: SparkSession, df: DataFrame, settings: dict) -> None:
    """Merge data into an existing Delta table, avoiding duplicate rows."""
    # print(f"\033[1;33mtableMerge - {settings.get('path')}\033[0m")  # debug terminal
    if settings.get('key') is None:
        raise ValueError('Provide a key in your settings dict to merge tables')

    # Only merge if the target path already holds a Delta table.
    if DeltaTable.isDeltaTable(spark, settings.get('path')):
        spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")
        spark.sql("SET spark.databricks.delta.resolveMergeUpdateStructsByName.enabled = false")
        debugSession(spark)

        dt = DeltaTable.forPath(spark, settings.get('path'))
        # Insert only rows whose key is not already present; existing rows are untouched.
        (dt.alias("t")
            .merge(
                df.alias("s"),
                f"t.{settings.get('key')} = s.{settings.get('key')}")
            .whenNotMatchedInsertAll()
            .execute())
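
A hedged usage sketch; the dataframe contents, path, and key below are illustrative assumptions (debugSession is an external helper assumed to exist in the caller's module):

# Hypothetical data and settings for illustration only.
new_rows = spark.createDataFrame([(1, "alice"), (2, "bob")], ["id", "name"])
tableMerge(spark, new_rows, {"path": "/mnt/delta/users", "key": "id"})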