Example #1
def filter_fiscal_year_and_month(
        df: DataFrame,
        fiscal_year: str,
        month_abbr: str,
        date_fmt: str = 'MM/dd/yyyy',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    filtered = (
        df
        .withColumn(
            '_fiscal_year',
            F.coalesce(
                F.year(F.add_months(
                    F.to_date(filter_column_year, date_fmt), 3)),
                F.year(F.add_months(
                    F.to_date(filter_column_month, date_fmt), 3))
            ))
        .withColumn(
            '_month_abbr',
            F.coalesce(
                F.upper(F.date_format(
                    F.to_date(filter_column_year, date_fmt), 'MMM')),
                F.upper(F.date_format(
                    F.to_date(filter_column_month, date_fmt), 'MMM'))
            ))
        .where(F.col('_fiscal_year') == fiscal_year)
        .where(F.col('_month_abbr') == month_abbr)
    )  # yapf: disable
    return filtered
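A minimal usage sketch (not part of the original snippet), assuming an active SparkSession named `spark` and the imports the function relies on (`from pyspark.sql import DataFrame` and `from pyspark.sql import functions as F`); the sample dates are invented:

sample = spark.createDataFrame(
    [('11/15/2023', '11/20/2023')],
    ['voucher_creation_date', 'shipment_pickup_date'])

# Dates are shifted forward 3 months, so November 2023 falls in fiscal year 2024.
filter_fiscal_year_and_month(sample, fiscal_year='2024', month_abbr='NOV').show()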
Example #2
    def test_add_months_function(self):
        dt = datetime.date(2021, 12, 27)

        # Note: a number in Python gets converted to a LongType column;
        # this is not supported by the function, so cast to Integer explicitly
        df = self.spark.createDataFrame([Row(date=dt, add=2)],
                                        "date date, add integer")

        self.assertTrue(
            all(
                df.select(
                    add_months(df.date, df.add) == datetime.date(2022, 2, 27),
                    add_months(df.date, "add") == datetime.date(2022, 2, 27),
                    add_months(df.date, 3) == datetime.date(2022, 3, 27),
                ).first()))
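To illustrate the note above: without the explicit DDL schema, the Python integer would be inferred as a LongType column. A sketch of the cast the comment suggests (illustrative only; it reuses the `spark`, `Row`, `datetime` and `add_months` names the test already assumes):

df2 = spark.createDataFrame([Row(date=datetime.date(2021, 12, 27), add=2)])
df2 = df2.withColumn("add", df2["add"].cast("int"))  # LongType -> IntegerType
df2.select(add_months(df2.date, df2.add)).show()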
Example #3
def get_date_dimension(spark, year, month, logger):
    """Create the dataframe for the date dimension for this year and month"""
    start_date = datetime.date(int(year), int(month), 1)
    basedates_df = spark.createDataFrame([(start_date,)], ['start_date'])
    basedates_df = basedates_df.withColumn(
        'end_date', add_months(basedates_df.start_date, 1))
    end_date = basedates_df.collect()[0][1]
    numdays = (end_date - start_date).days

    datelist = [start_date + datetime.timedelta(days=x) for x in range(numdays)]
    datesofmonth_df = spark.createDataFrame(datelist, DateType())\
        .withColumnRenamed('value', 'date')

    date_df = datesofmonth_df\
        .withColumn('str_weekday', date_format(datesofmonth_df.date, 'EEEE'))\
        .selectExpr("date as date_id",
                    "day(date) as day",
                    "weekofyear(date) as week",
                    "month(date) as month",
                    "year(date) as year",
                    "str_weekday as weekday")\
        .distinct()\
        .orderBy('date_id')
    date_df.persist()
    logger.info("date_df row count: {}".format(date_df.count()))
    return date_df
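A possible invocation (illustrative only; it assumes an active SparkSession `spark`, the imports the function needs, and any `logging.Logger`):

import logging

logger = logging.getLogger(__name__)
feb_2021 = get_date_dimension(spark, 2021, 2, logger)  # one row per day of February 2021
feb_2021.show(5)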
Example #4
def getGroupedPointOwners():
    if environment == "local":
        pass
    elif environment == "cluster":
        pass
    else:
        raise AssertionError(
            "Bad environment variable (environment = {}). Should be local or cluster"
            .format(environment))

    points_df = spark.read.format("orc").load(files_directory)

    # df modification
    # points_df = points_df.withColumn("qty", points_df["qty"].cast(DoubleType()))
    # points_df = points_df.withColumn("period_full_date",
    #                                  from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd hh:mm:ss'))
    points_df = points_df.withColumn(
        "period",
        from_unixtime(points_df["period"] / 1000, 'yyyy-MM-dd hh:mm:ss'))
    #
    # print(points_df.printSchema)
    # points_df.show(10)
    #
    # points_stats = points_df \
    #     .groupBy(["period_year_month", "organisationid", "customerid", "typeid"]) \
    #     .agg(_sum("qty").alias("total_qty")).orderBy(desc("period_year_month"))

    points_df = points_df.withColumn("qty",
                                     points_df["qty"].cast(DoubleType()))
    points_df = points_df.withColumn('month', trunc(points_df['period'], 'MM'))

    points_df = points_df.groupby(
        ['organisationid', 'customerid', 'typeid', 'month']).sum('qty')

    points_df = points_df.withColumn(
        "cumulativeSum",
        _sum('sum(qty)').over(
            Window.partitionBy(['organisationid', 'customerid',
                                'typeid']).orderBy('month')))

    points_df = points_df.withColumn('aggdate',
                                     add_months(points_df['month'], 1))
    points_df = points_df.withColumn('aggdate_ts',
                                     to_timestamp(points_df['aggdate']))
    points_df = points_df.withColumn('aggdate_date',
                                     points_df['month'].cast(DateType()))
    points_df = points_df.withColumn("qty", points_df["cumulativeSum"])

    points_df = points_df.drop('cumulativeSum')
    points_df = points_df.drop('sum(qty)')
    points_df = points_df.drop('month')

    points_df.show(100)

    return points_df
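The running total over a per-customer window is the key step of this example; a self-contained sketch of just that pattern, with invented data and a reduced set of grouping columns:

from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("org1", "cust1", "2021-01-01", 10.0),
     ("org1", "cust1", "2021-02-01", 5.0),
     ("org1", "cust2", "2021-01-01", 7.0)],
    ["organisationid", "customerid", "month", "qty"])

# Running sum of qty per customer, ordered by month.
w = Window.partitionBy("organisationid", "customerid").orderBy("month")
toy.withColumn("cumulativeSum", F.sum("qty").over(w)).show()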
Example #5
def add_fiscal_year_and_month_abbr(
        df,
        date_fmt: str = 'yyyy/MM/dd',
        filter_column_year: str = 'voucher_creation_date',
        filter_column_month: str = 'shipment_pickup_date') -> DataFrame:
    expr_mapping = {
        '_fiscal_year': (F.coalesce(
            F.year(F.add_months(F.to_date(filter_column_year, date_fmt), 3)),
            F.year(F.add_months(F.to_date(filter_column_month, date_fmt),
                                3)))),
        '_month_abbr': (F.coalesce(
            F.upper(
                F.date_format(F.to_date(filter_column_year, date_fmt), 'MMM')),
            F.upper(
                F.date_format(F.to_date(filter_column_month, date_fmt),
                              'MMM'))))
    }
    select_expr = build_col_expr(expr_mapping)
    transformed = df.select(F.expr('*'), *select_expr)
    return transformed
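`build_col_expr` is defined elsewhere in the original project and is not shown here; a hypothetical minimal version that would make this snippet work simply aliases each expression with its key:

def build_col_expr(expr_mapping):
    # Hypothetical helper: turn a {name: Column} mapping into a list of aliased Columns.
    return [expr.alias(name) for name, expr in expr_mapping.items()]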
Example #6
def preprocess_educations(df, output_path):
    """
    简历教育经历信息预处理
    :param df: 教育经历数据,DataFrame数据类型
    :param output_path: 输出文件路径
    :return: 
    """

    # Output columns after preprocessing
    output_columns = [
        "resume_id", "school_name", "school_area", "major", "category",
        "subcategory", "edu_start_date", "edu_end_date", "edu_start_year",
        "edu_end_year", "degree", "edu_index"
    ]

    # Normalize school name and degree
    df = df.withColumn("school_name",
                       F.udf(normalization_school_name)(df.school))
    df = df.withColumn("school_area", F.udf(mark_school_area)(df.school_name))
    df = df.withColumn("major", F.udf(normalization_major)(df.major))
    df = df.withColumn("degree", F.udf(normalization_degree)(df.degree))
    # df = df.withColumn("major_category", F.udf(get_major_level2)(df.major, df.degree))
    df = df.withColumn(
        "subjects",
        F.udf(normalized_major_tuple,
              ArrayType(ArrayType(StringType())))(df.major, df.degree))
    df = df.withColumn("subjects", F.explode("subjects")) \
        .withColumn("category", F.col("subjects")[0]) \
        .withColumn("subcategory", F.col("subjects")[1]) \
        .withColumn("major", F.col("subjects")[2])
    # Standardize the date format
    df = df.withColumn("start_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.start_time))
    df = df.withColumn("end_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.end_time))
    df = df.withColumn("edu_start_date", F.date_format(df.start_time,
                                                       "yyy-MM"))
    df = df.withColumn("edu_end_date", F.date_format(df.end_time, "yyy-MM"))
    df = df.withColumn(
        "edu_end_date",
        F.when(df.edu_end_date.isNotNull(), df.edu_end_date).otherwise(
            F.date_format(F.add_months(df.edu_start_date, 48), "yyy-MM")))
    df = df.withColumn("edu_start_year", F.year(df.start_time))
    df = df.withColumn("edu_end_year", F.year(df.end_time))
    df = df.withColumn("edu_index", df.index)

    # Filter conditions
    df = df.dropna(how="all", subset=["school_name", "major", "degree"])

    # Save the preprocessed results to the specified output file
    df.select(output_columns).write.mode('overwrite').save(output_path)
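The add_months fallback for a missing end date is the part most relevant to this collection; isolated into a runnable sketch (the data and column names are invented):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
edu = spark.createDataFrame([("2015-09-01", None)], "start string, end string")
edu = edu.withColumn(
    "end",
    F.coalesce(F.col("end"),
               F.date_format(F.add_months("start", 48), "yyyy-MM")))
edu.show()  # the missing end date becomes start + 48 months, formatted as yyyy-MM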
Example #7
def cad_to_usd_rate(currency_exchange_rates: DataFrame, fiscal_year: str,
                    month_abbr: str) -> float:
    """Currently returns latest exchange rate for the given month."""
    filtered = (
        currency_exchange_rates
        .where(F.col('currency_code_from') == 'CAD')
        .where(F.col('currency_code_to') == 'USD')
        .where(F.year(F.add_months(F.to_date(
            'effective_date', 'yyyyMMdd'), 3)) == fiscal_year)
        .where(F.upper(F.date_format(F.to_date(
            'effective_date', 'yyyyMMdd'), 'MMM')) == month_abbr)
        .sort('effective_date', ascending=False)
    )  # yapf: disable
    return filtered.first().conversion_rate_multiplier
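A hypothetical call with a tiny hand-built rates table (the row values are invented; it assumes `spark` and the same `F` import used above):

rates = spark.createDataFrame(
    [("CAD", "USD", "20201115", 0.76),
     ("CAD", "USD", "20201120", 0.77)],
    ["currency_code_from", "currency_code_to", "effective_date",
     "conversion_rate_multiplier"])

# November 2020 falls in fiscal year 2021 here; the latest November rate (0.77) is returned.
print(cad_to_usd_rate(rates, fiscal_year='2021', month_abbr='NOV'))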
Example #8
def create_scoring_output(pargs, params):
    """
    Generate final renewal scoring CSV file for MD dashboard

    :inputs: renewal_scoring_output
    :outputs: md_prediction_1_firstyr_renewal
    """

    # Read and refine scoring output table
    model_result = spark.read.parquet(
        data_paths["renewal_scoring_output"].format(run_mode=run['run_mode'],
                                                    run_id=run['run_id']))
    model_result = model_result.dropDuplicates(['imc_no'])
    output_tbl = model_result.groupBy(
        ["n_signup_year",
         "n_signup_month"]).count().withColumnRenamed('count',
                                                      'num_population')

    # Create new fields for MD dashboard
    output_prob = model_result.groupBy([
        "n_signup_year", "n_signup_month"
    ]).sum('renewal_label').withColumnRenamed('sum(renewal_label)',
                                              'num_renew')
    output_tbl = output_tbl.join(output_prob,
                                 ["n_signup_year", "n_signup_month"])
    output_tbl = output_tbl.withColumn(
        'renewal_pct', output_tbl['num_renew'] / output_tbl['num_population'])
    output_tbl = output_tbl.withColumn('signup_month', F.concat(F.col("n_signup_year"), F.lit(""), F.col("n_signup_month"))) \
        .withColumn('signup_month', F.to_date(F.col('signup_month'), 'yyyyMM')).drop(*["n_signup_year", "n_signup_month"]) \
        .withColumn('month', F.add_months('signup_month', 15))
    output_tbl = output_tbl.withColumn(
        'signup_month',
        F.date_format(F.col("signup_month"), "yyyyMM")).withColumn(
            'month', F.date_format(F.col("month"),
                                   "yyyyMM")).orderBy('signup_month')

    # Write final table
    output_tbl.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite') \
        .csv(data_paths['md_prediction_1_firstyr_renewal'].format(run_mode=run['run_mode'], run_id=run['run_id']), header='true')
Example #9
    "make_timestamp(YEAR, MONTH, DAY, HOUR, MINUTE, SECOND) as THE_TIMESTAMP")
ts.show(truncate=False)

# COMMAND ----------

# Add 1 month to the dt column and alias the result as next_month
df = spark.createDataFrame([('2015-04-08', )], ['dt'])
# Expected:
# +----------+
# |next_month|
# +----------+
# |2015-05-08|
# +----------+

# Answer
df.select(F.add_months(df.dt, 1).alias('next_month')).show()

# COMMAND ----------

# Get the current date and current timestamp:
# Expected like this:
# +--------------+-----------------------+
# |current_date()|current_timestamp()    |
# +--------------+-----------------------+
# |2021-01-24    |2021-01-24 22:27:35.045|
# +--------------+-----------------------+

# Answer
df.select(F.current_date(), F.current_timestamp()).show(truncate=False)

# COMMAND ----------
Example #10
    # .where(f"date=='{date}'")
    # Credit to the professor; I leveraged this piece of code from class


    def find_median(values_list):
        try:
            median = np.median(values_list)
            return round(float(median), 2)
        except Exception:
            return None

    categories = [
        "big_box_grocers", "convenience_stores", "drinking_places",
        "full_service_restaurants", "limited_service_restaurants",
        "pharmacies_and_drug_stores", "snack_and_bakeries",
        "specialty_food_stores", "supermarkets_except_convenience_stores"
    ]

    for c in categories:
        df.join(filteredCorePlaces, ["placekey"], "inner").groupBy("date","file_name")\
            .agg(F.collect_list("visits").alias('visit'), F.round(F.stddev("visits")).cast("integer").alias('std'))\
            .withColumn("median", median_finder('visit'))\
            .withColumn("low", when(F.col("std") > F.col("median"), 0).otherwise(F.col("median") - (F.col("std"))))\
            .withColumn("high", F.col("median") + F.col("std"))\
            .withColumn("year", F.year("date"))\
            .withColumn("project_date", F.add_months(F.col("date"), 12))\
            .sort(F.col("year"), F.col("project_date"))\
            .where((F.col("year").isin(2019, 2020)) & (F.col("file_name") == c))\
            .select(F.col("year"), F.col("project_date").alias('date'), F.col("median"),F.col("low"),F.col("high"))\
            .coalesce(1).write.mode("overwrite").option("header",True).format("csv").save("/{}/{}".format(OUTPUT_PREFIX, c))
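`median_finder` is used in the loop but never defined in this snippet; given `find_median` above, a plausible (hypothetical) definition is a UDF wrapper:

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

median_finder = udf(find_median, DoubleType())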
Example #11
      option("header", "true").option(
          "path", os.path.join(DATA_DIRECTORY, "flights",
                               "departuredelays.csv")).load())

# %%
df.show(5, False)

# %%
df = df.withColumn("date_iso", F.to_date(F.col("date"), "MMddHHmm"))

# %%
df.show(50, False)

# %%
# Assume the data are for 2019: 'MMddHHmm' has no year, so parsing defaults to 1970,
# and shifting forward 12 * 49 months (49 years) lands the dates in 2019
df = df.withColumn("date_iso", F.add_months(F.col("date_iso"), 12 * 49))

# %%
df.show(50, False)

# %%
df.createOrReplaceTempView("departuredelays")

# %%
# subquery
spark.sql("""
SELECT date_iso, delay, origin, destination
FROM departuredelays
WHERE delay > 120
    AND ORIGIN = 'SFO'
    AND DESTINATION = 'ORD'
Example #12
def create_migration_monthly_target_variables(pargs, params):
    """
    This function creates bns migration target variables table.
    Target variable: bns_migration_label, contains values (UP, SAME, DOWN)

    :inputs: behaviors_combined_data
    :outputs: bns_migration_3m_outcomes or bns_migration_6m_outcomes
    """

    # Load data
    behaviors_combined_data = sqlContext.read.parquet(
        data_paths['behaviors_combined_data'].format(run_mode=run['run_mode'],
                                                     run_id=run['run_id']))

    # Create time-related columns and adjust column types. Register a temporary table for SQL queries
    behaviors_combined_data = behaviors_combined_data.withColumn(
        'mo_yr_key_no', F.to_date(F.col('mo_yr_key_no_str'), 'yyyyMM'))
    behaviors_combined_data = behaviors_combined_data.withColumn(
        'calendar_year', year(F.col('mo_yr_key_no')))
    behaviors_combined_data = behaviors_combined_data.withColumn('imc_months_after_signup', behaviors_combined_data['imc_months_after_signup'].cast('int')) \
        .withColumn('bns_pct', behaviors_combined_data['bns_pct'].cast('int'))
    behaviors_combined_data.createOrReplaceTempView("behaviors_combined")

    # Set parameters for 3 months or 6 months tables
    if configs['target_3m']:
        monthPara = "3m"
        monthNum = "3"
        monthLag = '2'
    else:
        monthPara = "6m"
        monthNum = "6"
        monthLag = '5'

    # Processes of generating bns migration target table in Spark SQL
    bns_migration_eda_step_0 = sqlContext.sql(
        queries.bns_migration_eda_step0_sql.format(
            monthLag=monthLag,
            monthPara=monthPara,
            cntry_list=','.join(str(x) for x in run['cntry_key_no'])))
    bns_migration_eda_step_0 = bns_migration_eda_step_0.withColumn(
        "mo_yr_key_no_" + monthPara + "_pre",
        F.add_months('mo_yr_key_no', -int(monthNum)))
    bns_migration_eda_step_0.createOrReplaceTempView(
        "bns_migration_eda_step_0")

    bns_migration_eda_step_1 = sqlContext.sql(
        queries.bns_migration_eda_step1_sql.format(monthPara=monthPara))
    bns_migration_eda_step_1.createOrReplaceTempView(
        "bns_migration_eda_step_1")

    bns_migration_monthly_outcomes = sqlContext.sql(
        queries.bns_migration_outcomes_sql.format(monthLag=monthLag,
                                                  monthPara=monthPara))
    bns_migration_monthly_outcomes = bns_migration_monthly_outcomes.withColumn(
        "original_back_" + monthNum,
        F.add_months('mo_yr_key_no', -int(monthNum)))

    # Write final table
    writePath = data_paths['bns_migration_{monthPara}_outcomes'.format(
        monthPara=monthPara)]
    bns_migration_monthly_outcomes.write.mode('overwrite').option("maxRecordsPerFile", configs["maxRecordsPerFile"]) \
        .parquet(writePath.format(run_mode=run['run_mode'], run_id=run['run_id']))
Example #13
#list_1 = ["col_1", "col_2"]
#df_1 = spark.createDataFrame(list_1, StringType())
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------

#Date functions
from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit
orders_new_col.select(
    current_timestamp(), current_date(),
    date_format(current_timestamp(), "yyyy-MM-dd a hh:HH:mm").alias("dat"),
    dayofyear(current_timestamp()).alias("dayofyear_val"),
    month(current_timestamp()).alias("mon"),
    date_sub(current_timestamp(), 1).alias("date_d")).select(
        datediff(current_timestamp(), "date_d")).show(1, truncate=False)
orders_new_col.select(add_months(
    current_timestamp(), -1).alias("prev_mon")).select(
        months_between(current_timestamp(),
                       "prev_mon").alias("no_of_mon")).select(
                           next_day(current_timestamp(), "Mon")).select(
                               last_day(current_timestamp())).show(1)
orders_new_col.select(date_trunc('year', lit('2020-04-01'))).show(1)

# COMMAND ----------

from pyspark.sql.functions import concat_ws
#orders_new_col.printSchema()
#spark.sql("create database if not exists newdb")
#spark.sql("show databases").show()
ordersText = orders_new_col.select(
    concat_ws("~", "order_id", "order_date", "order_customer_id",
              "order_status").alias("concal_col"))
Example #14
    def process_history_df_mom(self, df):
        """
        Process function for history data, generate result dataframe
        that contains week, number of create events
        """
        # There are two versions of API for CreateEvent of repository:
        # - One is        col("payload")['object'] == 'repository'
        # - Another is    col("payload")['ref_type'] == 'repository'
        # try:
        df_columns = df.columns
        df_first_record = df.first()
        # keyword = 'object' if 'object' in df_first_record['payload'] else 'ref_type'

        num_create_events_df = \
            df \
            .filter((col('payload')['ref_type'] == 'repository') | (col('payload')['object'] == 'repository')) \
            .filter((col('type') == 'CreateEvent') | (col('type') == 'Event'))

        # count the number of create events that happened in each month (group by month)
        num_create_events_by_month_df = num_create_events_df.groupby(
            date_trunc('month',
                       df.created_at).alias('month_created_at')).count()

        # calculate the growth rate of each month compared to the previous month:
        # duplicate the dataframe, and for each month in the first dataframe
        # find the number of create events in the second dataframe
        # for the month that is one month earlier
        # [df1] 2015-02-01 -> [df2] 2015-01-01 (1 month)
        num_create_events_by_month_df_1 = num_create_events_by_month_df.alias(
            'num_create_events_by_month_df_1')

        num_create_events_by_month_df_1 = \
            num_create_events_by_month_df_1 \
            .select(
                col('month_created_at').alias('month_created_at_1'),
                col('count').alias('count_1'))

        num_create_events_by_month_df_2 = num_create_events_by_month_df.alias(
            'num_create_events_by_month_df_2')

        num_create_events_by_month_df_2 = \
            num_create_events_by_month_df_2 \
            .select(
                col('month_created_at').alias('month_created_at_2'),
                col('count').alias('count_2'))

        joined_num_create_events_df = \
            num_create_events_by_month_df_1 \
            .withColumn(
                'last_week_month_created_at',
                date_trunc(
                    'month',
                    add_months(num_create_events_by_month_df_1.month_created_at_1, -1))) \
            .join(
                num_create_events_by_month_df_2,
                col('last_week_month_created_at')
                == col('month_created_at_2'),
                how='left_outer')

        joined_num_create_events_df.show()

        joined_num_create_events_df = joined_num_create_events_df.withColumn(
            'count_2', coalesce('count_2', 'count_1'))

        num_create_events_with_growth_rate_df = \
            joined_num_create_events_df \
            .withColumn(
                'monthly_increase_rate',
                ((joined_num_create_events_df.count_1 - joined_num_create_events_df.count_2) / joined_num_create_events_df.count_2)
            ) \
            .select(
                'month_created_at_1',
                'count_1',
                'monthly_increase_rate')

        num_create_events_with_growth_rate_df.show()

        return num_create_events_with_growth_rate_df
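Condensed, the month-over-month logic above is a self-join keyed on add_months(month, -1); a compact, self-contained sketch of the same idea with invented data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import add_months, coalesce, col, to_date

spark = SparkSession.builder.getOrCreate()
monthly = spark.createDataFrame(
    [("2015-01-01", 100), ("2015-02-01", 150)], "month string, count bigint"
).withColumn("month", to_date("month"))

prev = monthly.select(col("month").alias("prev_month"),
                      col("count").alias("prev_count"))
mom = (monthly
       .withColumn("prev_month", add_months("month", -1))
       .join(prev, "prev_month", "left_outer")
       .withColumn("prev_count", coalesce("prev_count", "count"))
       .withColumn("monthly_increase_rate",
                   (col("count") - col("prev_count")) / col("prev_count")))
mom.show()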
Example #15
print("JOB STARTED...")
car_sales     = spark.sql("SELECT * FROM sales.car_sales")
customer_data = spark.sql("SELECT * FROM marketing.customer_data")
car_installs  = spark.sql("SELECT * FROM factory.car_installs")
factory_data  = spark.sql("SELECT * FROM factory.experimental_motors")
geo_data      = spark.sql("SELECT postalcode as zip, latitude, longitude FROM marketing.geo_data_xref")
print("\tREAD TABLE(S) COMPLETED")



#---------------------------------------------------
#                  APPLY FILTERS
# - Remove under aged drivers (less than 16 yrs old)
#---------------------------------------------------
before = customer_data.count()
customer_data = customer_data.filter(col('birthdate') <= F.add_months(F.current_date(), -192))  # 192 months = 16 years
after = customer_data.count()
print(f"\tFILTER DATA (CUSTOMER_DATA): Before({before}), After ({after}), Difference ({after - before}) rows")



#---------------------------------------------------
#             JOIN DATA INTO ONE TABLE
#---------------------------------------------------
# SQL way to do things
salesandcustomers_sql = "SELECT customers.*, sales.sale_date, sales.saleprice, sales.model, sales.VIN \
                            FROM sales.car_sales sales JOIN marketing.customer_data customers \
                             ON sales.customer_id = customers.customer_id "
tempTable = spark.sql(salesandcustomers_sql)
if (_DEBUG_):
    print("\tTABLE: CAR_SALES")
Example #16
 @author rambabu.posa
"""
from pyspark.sql import SparkSession
from pyspark.sql import (functions as F)
import os

current_dir = os.path.dirname(__file__)
relative_path = "../../resources/data/sparkInActionData/dates.csv"
absolute_file_path = os.path.join(current_dir, relative_path)

# Creates a session on a local master
spark = SparkSession.builder.appName("add_months function") \
    .master("local[*]").getOrCreate()

df = spark.read.format("csv") \
    .option("header", True) \
    .option("imferSchema", True) \
    .load(absolute_file_path)

df = df.withColumn("add_months+2", F.add_months(F.col("date_time"), 2))
# As of now, no support for this operation in PySpark API
# df = df.withColumn("add_months+val", F.add_months(F.col("date_time"), F.col("val")))
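# (Example #2 above, a unit test that calls add_months(df.date, df.add), does pass a
#  Column as the second argument, so newer Spark releases may accept the line above.)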

df.show(5, False)
df.printSchema()

df.show(5)

spark.stop()
Example #17
# # Number of alarms per unit per day
# unit_day_alarm_count=unit_hour_alarm_count.withColumn('alarm_time',date_format(unit_hour_alarm_count.alarm_time, 'yyyy-MM-dd'))\
#     .groupby('unit_id','dept_id','alarm_time')\
#     .sum('count')\
#     .withColumnRenamed("sum(count)","count")\
#     .sort('count',ascending=False)
# unit_day_alarm_count.show()

# Electricity-usage metering data
new_df = files['dev_id', 'unit_id', 'dept_id', 'voltage', 'current',
               'current_power', 'frequency', 'factor', 'current_power',
               'max_power', 'reactive_power', 'apparent_power',
               'temperature_value', 'leakage_value', 'alarm_time']

new_df = new_df.filter(
    new_df['alarm_time'] > add_months(current_date(), -1)).sort('alarm_time')
new_df.show()

max_count = new_df.groupby('dev_id').agg({
    'voltage': 'max',
    'current': 'max',
    'current_power': 'max',
    'temperature_value': 'max'
})
max_count.show()
min_count=new_df.groupby('dev_id').agg({'voltage':'min','current':'min','current_power':'min','temperature_value':'min'})\
    .withColumnRenamed("dev_id","dev_id2")
min_count.show()

data_count=max_count.join(min_count['min(current)','min(voltage)','min(temperature_value)','min(current_power)','dev_id2']
                          ,(min_count.dev_id2==max_count.dev_id)
Example #18
def vendor_subbusiness_transactions_table(tables,
                                          transactions_date,
                                          graph_spec):
    """
    Return a dataframe that contains vendor, voucher id, sub-business unit
    for a given transactions date.

    :param tables: dictionary that maps (table_name, src_name, dst_name)
                   or (table_name, node_name) to the appropriate DataFrame.
                   - "table_name" is the name of the table.
                   - "src_name"/"dst_name" are the `safe_name` of the
                       `source_column`/`target_column` for the edge list.
                   - "node_name" is the `safe_name` of the `index_column` for
                       node list.
    :type tables: dict
    :param transactions_date: date of the vouchers for checking
    :type transactions_date: str
    :param graph_spec: graph specification
    :type graph_spec: fncore.utils.graph_specification.GraphSpec
    :return: a dataframe that contains vendor, voucher id, sub-business unit
        for a given transactions date. The schema is as follows:
            | |-- vendor: str
            | |-- subbusiness_unit: str
            | |-- voucher_id: str
            | |-- gross_amt: decimal(27,3)
            | |-- invoice_dt: date
    :rtype: pyspark.sql.DataFrame
    """
    table_name = 'voucher sub_business_unit-entity edge'
    edge_list = graph_spec.name_to_edge_list[table_name]

    voucher_id_col_name = edge_list.index_column.safe_name.lower()
    gross_amt_col_name = (
        edge_list
        .name_to_metadata_column['GROSS_AMT_BSE']
        .safe_name
    )
    invoice_dt_col_name = (
        edge_list
        .name_to_metadata_column['INVOICE_DT']
        .safe_name
    )
    account_code_col_name = (
        edge_list
        .name_to_metadata_column['ACCOUNT']
        .safe_name
    )
    fiscal_year_col_name = (
        edge_list
        .name_to_metadata_column['ACCOUNTING_DATE']
        .safe_name
    )

    # 1. Get vendor to subbusiness unit relations
    # subbusiness_vendor: (source) subbusiness unit, (target) vendor
    # (metadata) voucher_id, gross_amt, invoice_dt, fiscal_year
    subbusiness_vendor = _df_col_names_lower(
        _get_dataframe_from_tables(tables, table_name, graph_spec)
    )

    vendor_subbusiness_transactions = (
        subbusiness_vendor
        .select(
            col('_canonical_id_source').alias('subbusiness_unit'),
            col('_canonical_id_target').alias('vendor'),
            col(voucher_id_col_name).alias('voucher_id'),
            col(account_code_col_name).alias('account_code'),
            col(gross_amt_col_name).alias('gross_amt'),
            to_date(col(invoice_dt_col_name)).alias('invoice_dt'),
            year(add_months(to_date(col(fiscal_year_col_name)), -3)).alias('fiscal_year'))
        .filter("invoice_dt='{}'".format(transactions_date))
        # Remove account codes starting with 1 which are EOM payments
        .filter("substr(account_code, 0, 1) != '1'")
    )

    # 2. Get vendor to vendor_name
    vendor = vendor_table(tables, graph_spec)

    # 3. Join vendor with vendor_subbusiness_transactions
    vendor_subbusiness_transactions = vendor_subbusiness_transactions.join(vendor, on='vendor')

    return vendor_subbusiness_transactions
Example #19
output_dir = path1

glueContext = GlueContext(SparkContext.getOrCreate())

#Get DynamicFrames from the Glue Catalog
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "c360view_stage", table_name = "mf_transactions_pqt", transformation_ctx = "datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database = "c360view_stage", table_name = "gbank_pqt", transformation_ctx = "datasource1")

#Convert to Dataframes
df_transactions = datasource0.toDF()
df_dispositions = datasource1.toDF()

df_dispositions = df_dispositions.drop("type")

#Filter by time windows (last 3 and 6 months) and aggregate by customer + transaction type
df_trans_acc_aggr_l3m = df_transactions.filter(F.col("date") >= F.add_months(F.current_date(), -3)).groupBy("account_id","type").agg(F.avg('amount'), F.count('amount'))
df_trans_acc_aggr_l6m = df_transactions.filter(F.col("date") >= F.add_months(F.current_date(), -6)).groupBy("account_id","type").agg(F.avg('amount'), F.count('amount'))

#Join DF with customer data
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.join(df_dispositions,'account_id' , how="inner")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.join(df_dispositions,'account_id' , how="inner")

#Renaming columns
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.withColumnRenamed("avg(amount)","amount_avg").withColumnRenamed("count(amount)","amount_count")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.withColumnRenamed("avg(amount)","amount_avg").withColumnRenamed("count(amount)","amount_count")

#Aggregations 2
df_trans_acc_aggr_l3m = df_trans_acc_aggr_l3m.withColumnRenamed("round(avg(amount_avg), 0)","amount_avg_trans").withColumnRenamed("sum(amount_count)","count_trans")
df_trans_acc_aggr_l6m = df_trans_acc_aggr_l6m.withColumnRenamed("round(avg(amount_avg), 0)","amount_avg_trans").withColumnRenamed("sum(amount_count)","count_trans")

#Date time partition columns generation