Example #1
def toProcessTransData():
    df1=rawdata[0].withColumn('yymmref',F.col("year")*100+F.lit(7)) \
     .withColumn("ed_model_id",F.col("ed_model_id").cast(T.StringType())) \
     .withColumn("yymminitial",F.when(F.col("yymmref")<F.col("yymminitial"),F.col("yymmref")).otherwise(F.col("yymminitial"))) \
     .withColumn('iyear',F.col('yymminitial').substr(1,4).cast("integer")) \
     .withColumn('imonth',F.col('yymminitial').substr(5,6).cast("integer")) \
     .withColumn("idate",F.to_date(F.concat_ws("-","iyear","imonth",F.lit(1)))) \
     .withColumn("adate",F.to_date(F.concat_ws("-","saleyear","salemonth",F.lit(1)))) \
     .withColumn("fdate",F.lit(fdate)) \
     .withColumn("age",F.months_between(F.col("adate"),F.col('idate'))+1) \
     .withColumn("age",F.when(F.col('age')<1,1).otherwise(F.col('age')))\
     .withColumn("agef",F.months_between(F.col("fdate"),F.col('idate'))+1) \
     .withColumn("agef",F.when(F.col('agef')<1,1).otherwise(F.col('agef')))\
     .withColumn("pdate",F.expr("add_months(fdate,0)")) \
     .withColumn("tdate",F.expr("add_months(fdate,-3)")) \
     .withColumn("syrmmt", F.year('tdate')*100+F.month('tdate')) \
     .withColumn("psaleyear", F.year('pdate')) \
     .withColumn("psalemonth", F.month('pdate'))\
     .cache()
    extraagelist = df1.filter("age<0  and trans_count>=3").groupBy(
        'year', 'make', 'model').agg(F.min(F.col("syrmm")).alias('age1'))
    df1 = df1.filter("age>0")
    #extraagelist.write.format("delta").option("overwriteSchema", "true").mode("overwrite").saveAsTable("stats.nn_shortterm_extraagelist")
    print(extraagelist.count())
    return df1
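# The function above relies on a yyyymm integer encoding (year*100 + month)
# decoded back with substr, plus a months_between age clamped to >= 1. A
# minimal, self-contained sketch of that pattern (hypothetical data; the
# real inputs come from rawdata[0]):
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(2020, 202003, 2021, 6)],
                           ["year", "yymminitial", "saleyear", "salemonth"])
df = (df
      .withColumn("iyear", F.col("yymminitial").substr(1, 4).cast("integer"))
      .withColumn("imonth", F.col("yymminitial").substr(5, 2).cast("integer"))
      .withColumn("idate", F.to_date(F.concat_ws("-", "iyear", "imonth", F.lit(1))))
      .withColumn("adate", F.to_date(F.concat_ws("-", "saleyear", "salemonth", F.lit(1))))
      # age in months between initial month and sale month, clamped to >= 1
      .withColumn("age", F.months_between("adate", "idate") + 1)
      .withColumn("age", F.when(F.col("age") < 1, 1).otherwise(F.col("age"))))
df.show()  # age = 16.0 (2020-03 to 2021-06, counted inclusively)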
Example #2
def join_df(df1, df2):
    df_joined = df1.join(df2, "title")\
        .withColumn("active time",
                    f.round(months_between(col("last_edit_date"), col("creation_date_of_article")), -1)) \
        .withColumn("time_since_last_edit",
                    f.round(months_between(to_timestamp(f.lit("2018-05-21")), col("last_edit_date"))))
    return df_joined
Example #3
def flat_work_year(df):
    # Talent who graduated within the last 5 years
    # df = df.filter(df.edu_end_year >= CURRENT_YEAR - 5)
    # The current job's minimum salary must exceed MIN_SALARY, and the range max must be >= the min
    df = df.filter((df.salary_min > MIN_SALARY)).filter(df.salary_max >= df.salary_min) \
        .filter(df.avg_salary.isNotNull())
    # Compute the length of employment from graduation until now
    df = df.withColumn("work_month_min",
                       F.months_between(df.work_start_date, df.edu_end_date))
    df = df.withColumn("work_month_max",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_month_duration",
                       F.months_between(df.work_end_date, df.work_start_date))
    # Convert work months into years
    df = df.withColumn("work_year_min",
                       F.udf(get_work_year, IntegerType())(df.work_month_min))
    df = df.withColumn("work_year_max",
                       F.udf(get_work_year, IntegerType())(df.work_month_max))
    # First job started within 5 years of graduation (0-1.5 years counts as 1 year); work duration > 0 and < 10 years
    df = df.filter(df.work_year_min > 0).filter(df.work_year_min <= 5) \
        .filter(df.work_month_duration >= 0).filter(df.work_month_duration < 120)
    # Spread the salary across the years worked
    df = df.withColumn(
        "work_years",
        F.udf(flat_work_year_salary,
              ArrayType(work_year_schema))(df.work_year_min, df.work_year_max,
                                           df.salary_min, df.salary_max))
    # Salary for each year of work
    df = df.withColumn("work_years", F.explode(df.work_years))
    df = df.withColumn("work_year", F.col("work_years")["work_year"]) \
        .withColumn("avg_salary", F.col("work_years")["salary"])
    return df
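# A minimal sketch of the UDF-array-explode pattern used above. The helper
# flat_work_year_salary is external, so a simplified stand-in that linearly
# spreads salary across the years is shown (hypothetical data):
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.types import (ArrayType, StructType, StructField,
                               IntegerType, FloatType)

spark = SparkSession.builder.getOrCreate()
work_year_schema = StructType([
    StructField("work_year", IntegerType()),
    StructField("salary", FloatType()),
])

def flat_salary(y_min, y_max, s_min, s_max):
    # One struct per year worked, interpolating salary from min to max.
    if y_max <= y_min:
        return [{"work_year": y_min, "salary": float(s_min)}]
    step = (s_max - s_min) / (y_max - y_min)
    return [{"work_year": y, "salary": float(s_min + step * (y - y_min))}
            for y in range(y_min, y_max + 1)]

df = spark.createDataFrame(
    [(1, 2, 5, 8000.0, 12000.0)],
    ["id", "work_year_min", "work_year_max", "salary_min", "salary_max"])
df = df.withColumn("work_years",
                   F.udf(flat_salary, ArrayType(work_year_schema))(
                       "work_year_min", "work_year_max",
                       "salary_min", "salary_max"))
df = df.withColumn("work_years", F.explode("work_years"))
df.select("id",
          F.col("work_years")["work_year"].alias("work_year"),
          F.col("work_years")["salary"].alias("avg_salary")).show()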
Example #4
def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Transforms each "start_date - end_date" period into year and month numbers

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|     month|
    +---+----------+----------+
    |  5|      2018|         1|
    +---+----------+----------+
    """

    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("idx", "val")) \
        .withColumn("month_date", F.expr("add_months(start_date, idx)")) \
        .withColumn("year", F.year("month_date")) \
        .withColumn("month", F.month("month_date")) \
        .drop('start_date', 'end_date', 'monthsDiff', 'repeat', 'val', 'idx', 'month_date') \
        .dropDuplicates()

    return transformed_data
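# Usage sketch on the docstring's sample row (assumes a SparkSession named
# `spark` and `from pyspark.sql import functions as F` as above):
sample = spark.createDataFrame([(5, "2018-01-01", "2018-01-09")],
                               ["key", "start_date", "end_date"]) \
    .withColumn("start_date", F.to_date("start_date")) \
    .withColumn("end_date", F.to_date("end_date"))
get_filtered_by_month(sample).show()
# +---+----+-----+
# |key|year|month|
# +---+----+-----+
# |  5|2018|    1|
# +---+----+-----+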
Example #5
    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self
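# `mag` above is an external helper; judging by Example #9 below it behaves
# like an absolute value. A minimal stand-alone sketch of the same age
# expression, with F.abs standing in for mag (hypothetical data):
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("1985-03-20",)], ["birth_date"])
age = F.format_number(
    F.abs(F.months_between(
        F.date_format(
            F.unix_timestamp("birth_date", "yyyy-MM-dd").cast("timestamp"),
            "yyyy-MM-dd"),
        F.current_date())) / 12, 4).alias("age")
df.withColumn("age", age).show()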
Example #7
def creation_date_of_article(df):
    columns_to_drop = ['redirect', 'ns', 'revision', 'date']
    df = df.withColumn("author", df["revision"].getItem(0).contributor.username)\
        .withColumn("authorID", df["revision"].getItem(0).contributor.id)\
        .withColumn("date", df["revision"].getItem(0).timestamp)\
        .withColumn("creation_date_of_article", from_unixtime('date', 'yyyy-MM-dd HH:mm:ss').cast("timestamp"))\
        .withColumn("time_since_creation",
                    f.round(months_between(col("creation_date_of_article"), to_timestamp(f.lit("2001-01-15")))))
    df_res = df.drop(*columns_to_drop)
    return df_res
Example #8
def postgraduate(profile_df, education_df, work_df, mysql_url):
    """
    留学、考研、就业分析
    :param profile_df: 简历基本信息
    :param education_df: 简历教育经历
    :param work_df: 简历工作经历
    :param mysql_url: 数据库路径
    :return: 
    """
    education_df = education_df.filter(education_df.school_name.isNotNull()) \
        .filter(education_df.major.isNotNull()) \
        .filter(education_df.degree.isNotNull())
    education_df = education_df.select("resume_id", "school_name",
                                       "school_area", "major", "degree",
                                       "edu_index", "edu_end_date")
    education_df = education_df.groupby("resume_id").apply(cal_postgraduate)
    education_df = education_df.filter(
        education_df.postgraduate.isNotNull()).filter(
            education_df.flag.isNull())
    education_df.persist()

    # Compute the average salary of each job
    work_df = work_df.withColumn(
        "avg_salary",
        F.udf(cal_avg_salary, FloatType())(work_df.salary_min,
                                           work_df.salary_max))
    work_df = work_df.filter(work_df.avg_salary > MIN_SALARY)

    # # Filter out records that only contain resume summary info (work history index >= 2)
    # wc_df = work_df.filter(work_df.work_index == 2).select("resume_id")
    # edu_df = wc_df.join(education_df, "resume_id")

    # Combine education history and work history
    df = education_df.join(work_df, "resume_id")
    df = df.withColumn("work_month",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_year",
                       F.udf(get_years, IntegerType())(df.work_month))
    df = df.filter(df.work_year >= 0).filter(df.work_year <= 40)

    # Analysis result tables
    result_tables = dict()
    result_tables[
        "school__major__postgraduate__ratio__v1"] = postgraduate_ratio(
            education_df)
    result_tables[
        "school__major__postgraduate__work_year__v1"] = postgraduate_work_year(
            df)
    # Unpersist the cached data
    education_df.unpersist()
    # Write the major-related analysis results to the database
    write_mysql(mysql_url, result_tables)
Example #9
        def _years_between(_new_col_name, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            return F.format_number(
                F.abs(
                    F.months_between(
                        F.date_format(
                            F.unix_timestamp(_col_name, _date_format).cast("timestamp"),
                            format_dt),
                        F.current_date()) / 12),
                4).alias(_new_col_name)
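# Hypothetical usage sketch: assumes the closure, a SparkSession `spark`, and
# the `format_dt` output format are all in scope (in the original source they
# live inside an enclosing function):
format_dt = "yyyy-MM-dd"
df = spark.createDataFrame([("1990-06-15",)], ["birth_date"])
df.select(_years_between("years_old", ("yyyy-MM-dd", "birth_date"))).show()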
Example #10
def combine_resume(profile_path, education_path, work_path, output_path):
    # Initialize the Spark environment
    conf = SparkConf().set("spark.ui.port", "44040")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load resume basic info, education history, and work history
    profile_df = sqlContext.read.parquet(profile_path)
    education_df = sqlContext.read.parquet(education_path)
    work_df = sqlContext.read.parquet(work_path)
    df = profile_df.join(education_df, "resume_id").join(work_df, "resume_id")
    # Compute the duration of each individual job
    df = df.withColumn(
        "work_end_date",
        F.when(
            F.lower(F.trim(df.work_end_date)).isin("至今", "今", "present"),
            df.update_date).otherwise(df.work_end_date))
    df = df.withColumn("work_duration_month",
                       F.months_between(df.work_end_date, df.work_start_date))
    df = df.withColumn(
        "work_duration_year",
        F.udf(lambda x: x / 12
              if x else -1, FloatType())(df.work_duration_month))
    # Compute the length of employment from graduation until now
    df = df.withColumn("work_month_min",
                       F.months_between(df.work_start_date, df.edu_end_date))
    df = df.withColumn("work_month_max",
                       F.months_between(df.work_end_date, df.edu_end_date))
    df = df.withColumn("work_month_duration",
                       F.months_between(df.work_end_date, df.work_start_date))
    # Convert work months into years
    df = df.withColumn("work_year_min",
                       F.udf(get_work_year, IntegerType())(df.work_month_min))
    df = df.withColumn("work_year_max",
                       F.udf(get_work_year, IntegerType())(df.work_month_max))
    df.write.mode('overwrite').save(output_path)
Example #11
 def createAgeDateColumns(self, df):
     """Criar colunas de idade do cliente e data do pedido.
     
     Args:
         df: Dataframe
     
     Returns:
         Dataframe.
     """
     log(
         "INFO",
         "[createAgeDateColumns] Creating customer age and order date columns."
     )
     return (df.withColumn(
         "data_pedido_date",
         date_format(
             from_unixtime(col('data_pedido')),
             "yyyy-MM-dd")).withColumn('idade', (months_between(
                 current_date(), col('data_nascimento_cliente')) / 12).cast(
                     IntegerType())))
Example #12
# COMMAND ----------

# Get the number of months (alias 'months') between the two following date columns. Round the "months" column. Show all 3 columns
df = spark.createDataFrame([('1997-02-28 10:30:00', '1996-10-30')],
                           ['date1', 'date2'])
# Expected:
# +-------------------+----------+------+
# |              date1|     date2|months|
# +-------------------+----------+------+
# |1997-02-28 10:30:00|1996-10-30|   4.0|
# +-------------------+----------+------+

# Answer
df.select('date1', 'date2',
          F.round(F.months_between('date1', 'date2'),
                  0).alias('months')).show()

# COMMAND ----------

# Using spark.sql and function make_date, from this df, build the associated date
spark.createDataFrame([(2020, 6, 26), (1000, 2, 29), (-44, 1, 1)],
                      ['Y', 'M', 'D']).createOrReplaceTempView('YMD')
# Expected:
# +-----------+
# |       date|
# +-----------+
# | 2020-06-26|
# |       null|
# |-0044-01-01|
# +-----------+
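# Answer (a sketch; make_date is available in Spark SQL 3.0+ and returns
# null for invalid combinations such as 1000-02-29)
spark.sql("SELECT make_date(Y, M, D) AS date FROM YMD").show()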
Example #13

'''Now we drop the year, month, day, hour, minute, date, and time columns, as we will recreate them from the timestamp column we created'''
df_nycflights = df_nycflights. \
                drop('year'). \
                drop('month'). \
                drop('day'). \
                drop('hour'). \
                drop('minute'). \
                drop('date'). \
                drop('time')

df_nycflights.show() 

'''Now we extract the fields back'''
df_nycflights = df_nycflights. \
                withColumn('year',year(df_nycflights.timestamp)). \
                withColumn('month',month(df_nycflights.timestamp)). \
                withColumn('day',dayofmonth(df_nycflights.timestamp)). \
                withColumn('hour',hour(df_nycflights.timestamp)). \
                withColumn('minute',minute(df_nycflights.timestamp))  

df_nycflights.show()

'''Now a few operations on the timestamp'''
df_nycflights = df_nycflights.\
                withColumn('date_sub',date_sub(df_nycflights.timestamp ,10)). \
                withColumn('date_add',date_add(df_nycflights.timestamp ,10)). \
                withColumn('months_between',months_between(df_nycflights.timestamp,df_nycflights.timestamp))

df_nycflights.show()                 
Example #14
 def add_age_information(self, date: datetime) -> None:
     self._subjects = self.subjects.withColumn(
         "age", floor(months_between(lit(date), col("birthDate")) /
                      12)).withColumn("ageBucket", floor(col("age") / 5))
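# Stand-alone sketch of the age / ageBucket derivation above
# (hypothetical data; the surrounding class wiring is omitted):
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.functions import floor, months_between, lit, col

spark = SparkSession.builder.getOrCreate()
subjects = spark.createDataFrame([("1984-07-01",)], ["birthDate"]) \
    .withColumn("birthDate", col("birthDate").cast("date"))
subjects = subjects.withColumn(
    "age", floor(months_between(lit(datetime(2020, 1, 1)), col("birthDate")) / 12)
).withColumn("ageBucket", floor(col("age") / 5))
subjects.show()  # age=35, ageBucket=7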
Example #15
            from (
            select 
                last_day(period_end_dte) as period_end_dte
            ,   last_day(add_months(period_end_dte,-3)) as period_start_dte
            ,   id

            from (select distinct
            issue_dte as period_end_dte
            from work_db.inp_data
            where extract(month from issue_dte) in (3,6,9,12)
            ) as dt
            
            cross join (select distinct id from work_db.inp_data)
            
            ) as id
            
            inner join work_db.inp_data as bse on bse.id = id.id
            where bse.issue_dte <= id.period_end_dte
                and coalesce(bse.Default_dte, Maturity_Dte,cast('9999-12-31' as date)) > period_start_dte
            """)

#Checks and other s***e
ABT = ABT.withColumn(
    'TOB', floor(months_between(col('period_end_dte'), col('issue_dte')) / 3))
print('2.3.2 Pushing ABT to Hive DB')
ABT.createOrReplaceTempView("ABT")
sqlContext.sql("""create table if not exists work_db.LC_DR_Periodic as
            (select * from ABT)""")

sc.stop()
Example #16
def create_extra_last_feature2(months):
    from pyspark.sql import functions as f
    from pyspark.sql.functions import date_format, to_date, to_timestamp
    from pyspark.sql.functions import col, when, regexp_replace
    import pyspark.sql.functions as func
    from pyspark.sql.functions import expr
    from pyspark.sql.window import Window
    import datetime
    download_df = spark.read.parquet(data_paths['download_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    browse_df = spark.read.parquet(data_paths['browse_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    share_df = spark.read.parquet(data_paths['share_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    search_df = spark.read.parquet(data_paths['search_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    fav_df = spark.read.parquet(data_paths['fav_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    fav_df2 = fav_df.withColumn("date",
                                to_date(fav_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    fav_df2 = fav_df2.withColumn(
        "MONTH_tmp", f.from_unixtime(f.unix_timestamp(fav_df2.date, "yyyyMM")))
    fav_df2 = fav_df2.withColumn(
        "MONTH",
        f.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    fav_df3 = fav_df2.withColumn(
        "ADJ_USERID", expr("substring(USERNAME, 1, length(USERNAME)-2)"))
    fav_df3 = fav_df3.withColumn(
        "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))"))
    fav = fav_df3.withColumn("ADJ_USERID",
                             regexp_replace(f.col("ADJ_USERID"), "^0*", ""))
    fav = fav.groupby(['ADJ_USERID', 'MONTH']).count()
    fav = fav.withColumnRenamed("count", "num_fav")
    download_df2 = download_df.withColumn(
        "date", to_date(download_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    download_df2 = download_df2.withColumn(
        "MONTH_tmp",
        f.from_unixtime(f.unix_timestamp(download_df2.date, "yyyyMM")))
    download_df2 = download_df2.withColumn(
        "MONTH",
        f.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    download_df3 = download_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    download_df4 = download_df3.withColumn(
        "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", ""))
    download = download_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    download = download.withColumnRenamed("count", "num_" + "download")
    browse_df2 = browse_df.withColumn(
        "date", to_date(browse_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    browse_df2 = browse_df2.withColumn(
        "MONTH_tmp",
        f.from_unixtime(f.unix_timestamp(browse_df2.date, "yyyyMM")))
    browse_df2 = browse_df2.withColumn(
        "MONTH",
        f.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    browse_df3 = browse_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    browse_df4 = browse_df3.withColumn(
        "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", ""))
    browse = browse_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    browse = browse.withColumnRenamed("count", "num_" + "browse")
    share_df2 = share_df.withColumn(
        "date", to_date(share_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    share_df2 = share_df2.withColumn(
        "MONTH_tmp", f.from_unixtime(f.unix_timestamp(share_df2.date,
                                                      "yyyyMM")))
    share_df2 = share_df2.withColumn(
        "MONTH",
        f.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    share_df3 = share_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    share_df4 = share_df3.withColumn(
        "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", ""))
    share = share_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    share = share.withColumnRenamed("count", "num_" + "share")
    search_df2 = search_df.withColumn(
        "date", to_date(search_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    search_df2 = search_df2.withColumn(
        "MONTH_tmp",
        f.from_unixtime(f.unix_timestamp(search_df2.date, "yyyyMM")))
    search_df2 = search_df2.withColumn(
        "MONTH",
        f.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    search_df3 = search_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    search_df4 = search_df3.withColumn(
        "ADJ_USERID", regexp_replace(col("ADJ_USERID"), "^0*", ""))
    search = search_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    search = search.withColumnRenamed("count", "num_" + "search")
    data = [("2013-01-01", str(datetime.date.today()))]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    df = df.withColumn("monthsDiff", f.months_between("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("add_months(minDate, date)")) \
        .select('date')
    df = df.withColumn(
        "MONTH", f.from_unixtime(f.unix_timestamp(f.col("date")),
                                 "yyyyMM")).select('MONTH')
    unique_id = download.select('ADJ_USERID').distinct() \
        .union(browse.select('ADJ_USERID').distinct()) \
        .union(share.select('ADJ_USERID').distinct()) \
        .union(search.select('ADJ_USERID').distinct()) \
        .union(fav.select('ADJ_USERID').distinct())
    unique_id = unique_id.distinct()
    all_abo_month = unique_id.crossJoin(df)
    combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \
        .union(share.select(['ADJ_USERID', 'MONTH'])) \
        .union(search.select(['ADJ_USERID', 'MONTH'])) \
        .union(fav.select(['ADJ_USERID', 'MONTH']))
    min_max_date = combine.groupby("ADJ_USERID").agg(f.min("MONTH"),
                                                     f.max("MONTH"))
    all_abo_month = all_abo_month.join(
        min_max_date,
        all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID,
        how='left').drop(min_max_date.ADJ_USERID)
    all_abo_month = all_abo_month.filter(f.col("MONTH") >= f.col("min(MONTH)"))
    all_abo_month = all_abo_month.filter(f.col("MONTH") <= f.col("max(MONTH)"))
    all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"])
    download = all_abo_month.join(download, ['ADJ_USERID', 'MONTH'],
                                  'left').na.fill(0)
    for n in range(1, n_max(12, n_max1(months))):
        download = download.withColumn('num_' + "download" + str(n), func.lag(download['num_' + "download"], n, 0) \
                                       .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    for i in months:
        if i in [1, 3, 6, 9, 12]:
            continue
        download = download.withColumn(
            "n_lag_currentyr_" + "download" + "_sum_{month}m".format(month=i),
            download['num_' + "download"])
        for iter in range(1, n_max(12, n_max1(months))):
            download = download.withColumn(
                "n_lag_currentyr_" + "download" +
                "_sum_{month}m".format(month=i),
                download["n_lag_currentyr_" + "download" +
                         "_sum_{month}m".format(month=i)] +
                download['num_' + "download" + str(iter)])
    droplist = []
    for n in range(1, n_max(12, n_max1(months))):
        droplist = droplist + ['num_' + "download" + str(n)]
    download = download.drop(*droplist)
    browse = all_abo_month.join(browse, ['ADJ_USERID', 'MONTH'],
                                'left').na.fill(0)
    for n in range(1, n_max(12, n_max1(months))):
        browse = browse.withColumn('num_' + "browse" + str(n), func.lag(browse['num_' + "browse"], n, 0) \
                                   .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    for i in months:
        if i in [1, 3, 6, 9, 12]:
            continue
        browse = browse.withColumn(
            "n_lag_currentyr_" + "browse" + "_sum_{month}m".format(month=i),
            browse['num_' + "browse"])
        for iter in range(1, n_max(12, n_max1(months))):
            browse = browse.withColumn(
                "n_lag_currentyr_" + "browse" +
                "_sum_{month}m".format(month=i),
                browse["n_lag_currentyr_" + "browse" +
                       "_sum_{month}m".format(month=i)] +
                browse['num_' + "browse" + str(iter)])
    droplist = []
    for n in range(1, n_max(12, n_max1(months))):
        droplist = droplist + ['num_' + "browse" + str(n)]
    browse = browse.drop(*droplist)
    share = all_abo_month.join(share, ['ADJ_USERID', 'MONTH'],
                               'left').na.fill(0)
    for n in range(1, n_max(12, n_max1(months))):
        share = share.withColumn('num_' + "share" + str(n), func.lag(share['num_' + "share"], n, 0) \
                                 .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    for i in months:
        if i in [1, 3, 6, 9, 12]:
            continue
        share = share.withColumn(
            "n_lag_currentyr_" + "share" + "_sum_{month}m".format(month=i),
            share['num_' + "share"])
        for iter in range(1, n_max(12, n_max1(months))):
            share = share.withColumn(
                "n_lag_currentyr_" + "share" + "_sum_{month}m".format(month=i),
                share["n_lag_currentyr_" + "share" +
                      "_sum_{month}m".format(month=i)] +
                share['num_' + "share" + str(iter)])
    droplist = []
    for n in range(1, n_max(12, n_max1(months))):
        droplist = droplist + ['num_' + "share" + str(n)]
    share = share.drop(*droplist)
    search = all_abo_month.join(search, ['ADJ_USERID', 'MONTH'],
                                'left').na.fill(0)
    for n in range(1, n_max(12, n_max1(months))):
        search = search.withColumn('num_' + "search" + str(n), func.lag(search['num_' + "search"], n, 0) \
                                   .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    for i in months:
        if i in [1, 3, 6, 9, 12]:
            continue
        search = search.withColumn(
            "n_lag_currentyr_" + "search" + "_sum_{month}m".format(month=i),
            search['num_' + "search"])
        for iter in range(1, n_max(12, n_max1(months))):
            search = search.withColumn(
                "n_lag_currentyr_" + "search" +
                "_sum_{month}m".format(month=i),
                search["n_lag_currentyr_" + "search" +
                       "_sum_{month}m".format(month=i)] +
                search['num_' + "search" + str(iter)])
    droplist = []
    for n in range(1, n_max(12, n_max1(months))):
        droplist = droplist + ['num_' + "search" + str(n)]
    search = search.drop(*droplist)
    fav = all_abo_month.join(fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0)
    for n in range(1, n_max(12, n_max1(months))):
        fav = fav.withColumn('num_' + "fav" + str(n), func.lag(fav['num_' + "fav"], n, 0) \
                             .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    for i in months:
        if i in [1, 3, 6, 9, 12]:
            continue
        fav = fav.withColumn(
            "n_lag_currentyr_" + "fav" + "_sum_{month}m".format(month=i),
            fav['num_' + "fav"])
        for iter in range(1, n_max(12, n_max1(months))):
            fav = fav.withColumn(
                "n_lag_currentyr_" + "fav" + "_sum_{month}m".format(month=i),
                fav["n_lag_currentyr_" + "fav" +
                    "_sum_{month}m".format(month=i)] +
                fav['num_' + "fav" + str(iter)])
    droplist = []
    for n in range(1, n_max(12, n_max1(months))):
        droplist = droplist + ['num_' + "fav" + str(n)]
    fav = fav.drop(*droplist)
    classroom_data = all_abo_month.join(
        download, ['ADJ_USERID', 'MONTH'],
        'left').join(browse, ['ADJ_USERID', 'MONTH'],
                     'left').join(share, ['ADJ_USERID', 'MONTH'], 'left').join(
                         search, ['ADJ_USERID', 'MONTH'],
                         'left').join(fav, ['ADJ_USERID', 'MONTH'],
                                      'left').na.fill(0)
    classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no")
    classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no")
    classroom_data = classroom_data.withColumn(
        'mo_yr_key_no', classroom_data.mo_yr_key_no.cast('string'))
    classroom_data = classroom_data.withColumn(
        'mo_yr_key_no', to_timestamp(classroom_data.mo_yr_key_no, 'yyyyMM'))
    classroom_data = classroom_data.withColumn(
        'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))
    classroom_data = classroom_data.drop(
        *['num_download', 'num_browse', 'num_share', 'num_search', 'num_fav'])
    from pyspark.sql.window import Window
    import pyspark.sql.functions as f
    # from pyspark.sql.functions import *
    from pyspark.sql.types import IntegerType
    from pyspark.sql.functions import date_format, to_date, to_timestamp
    wechat_mini = spark.read.option("delimiter", "\t").option(
        "header",
        "true").option("encoding",
                       "UTF-8").csv(data_paths['wechat_miniprogram_input'])
    wechat_mini = wechat_mini.withColumn('time',
                                         to_timestamp('时间戳', 'yyyy-MM-dd'))
    wechat_mini = wechat_mini.withColumn('month',
                                         to_timestamp('时间戳', 'yyyy-MM'))
    wechat_mini = wechat_mini.withColumn('month',
                                         date_format('month', 'yyyyMM'))
    wechat_mini2 = wechat_mini.withColumnRenamed('事件类型', 'event_type') \
        .withColumnRenamed('时间戳', 'timestamp') \
        .withColumnRenamed('诸葛id', 'trip_id') \
        .withColumnRenamed('事件id', 'event_id') \
        .withColumnRenamed('事件名', 'event_name') \
        .withColumnRenamed('商品id', 'product_id') \
        .withColumnRenamed('商品名称', 'product_name') \
        .withColumnRenamed('搜索词', 'search_word')
    # clean up imc_no
    wechat_mini3 = wechat_mini2.withColumn("leading360",
                                           expr("substring(amwayid, 1, 3)"))
    wechat_mini4 = wechat_mini3.withColumn(
        "ADJ_USERID",
        when(
            f.col("leading360") == "360",
            expr("substring(amwayid, 4, length(amwayid)-2)")).otherwise(
                f.col("amwayid")))
    wechat_mini5 = wechat_mini4.withColumn(
        "imc_no", regexp_replace(col("ADJ_USERID"), "^0*", ""))
    wechat_mini_all = wechat_mini5.withColumn(
        "imc_no",
        when(
            f.col("leading360") == "360",
            expr("substring(imc_no, 1, length(imc_no)-2)")).otherwise(
                f.col("imc_no")))
    # browse
    wechat_mini_browse = wechat_mini_all.where((f.col("event_type") == '页面浏览'))
    wechat_mini_browse2 = wechat_mini_browse.groupBy('imc_no', 'month').agg(
        f.count("event_id").alias("n_num_cloudcommerce_browse"))
    # search
    wechat_mini_search = wechat_mini_all.where((f.col("event_type") == '站内搜索'))
    wechat_mini_search2 = wechat_mini_search.groupBy('imc_no', 'month').agg(
        f.count("event_id").alias("n_num_cloudcommerce_search"))
    # order
    wechat_mini_order = wechat_mini_all.where(
        (f.col("event_name") == '小程序_订单确认'))
    wechat_mini_order2 = wechat_mini_order.groupBy('imc_no', 'month').agg(
        f.count("event_id").alias("n_num_cloudcommerce_order"))
    # cart
    purchase_trip = wechat_mini_order.select('trip_id').distinct()
    wechat_mini_cart = wechat_mini_all.join(
        purchase_trip, 'trip_id', 'inner').where(
            (f.col("event_type") == '商品加购'))
    wechat_mini_cart2 = wechat_mini_cart.groupBy(
        'imc_no', 'month', 'trip_id').agg(
            f.count("product_id").alias(
                "n_num_cloudcommerce_product_per_cart"))
    wechat_mini_cart3 = wechat_mini_cart2.groupBy('imc_no', 'month').agg(
        f.avg("n_num_cloudcommerce_product_per_cart").alias(
            "n_num_cloudcommerce_product_per_cart"))
    # all abo and month combination
    unique_id = wechat_mini_all.select('imc_no').distinct()
    month = wechat_mini_all.select('month').distinct()
    all_abo_month = unique_id.crossJoin(month)
    min_max_date = wechat_mini_all.groupby("imc_no").agg(
        f.min("month"), f.max("month"))
    all_abo_month = all_abo_month.join(
        min_max_date, all_abo_month.imc_no == min_max_date.imc_no,
        how='left').drop(min_max_date.imc_no)
    all_abo_month = all_abo_month.filter(f.col("month") >= f.col("min(month)"))
    all_abo_month = all_abo_month.filter(f.col("month") <= f.col("max(month)"))
    # join everything together
    combine1 = all_abo_month.join(wechat_mini_browse2, ['imc_no', 'month'],
                                  'left').na.fill(0)
    combine2 = combine1.join(wechat_mini_search2, ['imc_no', 'month'],
                             'left').na.fill(0)
    combine3 = combine2.join(wechat_mini_order2, ['imc_no', 'month'],
                             'left').na.fill(0)
    combine4 = combine3.join(wechat_mini_cart3, ['imc_no', 'month'],
                             'left').na.fill(0)
    # create lag features
    combine = combine4.withColumnRenamed("month", "mo_yr_key_no")
    feature_list = [
        'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search',
        'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart'
    ]
    lag_features = months
    for feature in feature_list:
        for lag_mo in lag_features:
            if lag_mo in [1, 3, 6, 9, 12]:
                continue
            for lag in range(0, lag_mo):
                colname = feature + "_" + str(lag)
                feature_col = feature + "_sum_" + str(lag_mo) + "m"
                combine = combine.withColumn(
                    colname,
                    f.lag(combine[feature], lag).over(
                        Window.partitionBy("imc_no").orderBy("mo_yr_key_no")))
                if lag == 0:
                    combine = combine.withColumn(feature_col, combine[colname])
                else:
                    combine = combine.withColumn(
                        feature_col, combine[feature_col] + combine[colname])
    main_col = ['imc_no', 'mo_yr_key_no']
    selected_feature = []
    for feature in feature_list:
        for lag_mo in lag_features:
            if lag_mo in [1, 3, 6, 9, 12]:
                continue
            feature_col = feature + "_sum_" + str(lag_mo) + "m"
            selected_feature.append(feature_col)
    selected_feature = main_col + feature_list + selected_feature
    wechat_cloudcommerce = combine.select(selected_feature)
    wechat_formatting = wechat_cloudcommerce
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', wechat_formatting.mo_yr_key_no.cast('string'))
    # wechat_formatting = wechat_formatting.withColumn('imc_no',wechat_formatting.imc_no.cast('string'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', to_timestamp(wechat_formatting.mo_yr_key_no, 'yyyyMM'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))
    wechat_cloudcommerce = wechat_formatting
    wechat_cloudcommerce = wechat_cloudcommerce.drop(*[
        'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search',
        'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart'
    ])
    return classroom_data, wechat_cloudcommerce
Example #17
transactions = spark.read.csv('transactions_w_reg_init_date_after_2015.csv',
                              header=True,
                              inferSchema=True)
transactions.count()
transactions.printSchema()
transactions = transactions.select(
    ['msno', 'is_churn_final', 'transaction_date'])

my_window = Window.partitionBy('msno').orderBy('transaction_date')

transactions = transactions.withColumn(
    'prev_transaction_date',
    F.lag(transactions.transaction_date).over(my_window))

transactions = transactions.withColumn('months_since_prev_transaction',\
    F.months_between(transactions.transaction_date, transactions.prev_transaction_date))
transactions.printSchema()
transactions = transactions.where(
    F.col('months_since_prev_transaction').isNotNull())

transactions.createOrReplaceTempView('trans')
q = 'SELECT first(msno) as msno, first(is_churn_final) as is_churn_final, round(sum(months_since_prev_transaction), 0) as month ' \
    'FROM trans ' \
    'GROUP BY msno '
joined_2015_or_later = spark.sql(q)

joined_2015_or_later.show(50)

joined_2015_or_later = joined_2015_or_later.toPandas()
joined_2015_or_later.head()
joined_2015_or_later.is_churn_final.mean()
    date_sub(col("today"), 5).alias('5 days ago'),
    date_add(col("today"), 5).alias('5 days ahead')).show(1)
date_df.selectExpr('(date_sub(today, 5)) as `5 days ago`',
                   'date_add(today, 5) as `5 days togo`').show(1, False)
#%%
"number of days and months between 2 dates"
from pyspark.sql.functions import datediff, months_between, to_date, lit, expr
print(
    date_df.withColumn("week_ago", date_sub(col("today"), 7)).select(
        datediff(col("week_ago"), col("today"))).show(1))

print(
    date_df.select(
        to_date(lit("2016-01-01")).alias("start"),
        to_date(lit("2017-05-22")).alias("end")).select(
            months_between(col("start"), col("end"))).show(1))

print(
    date_df.withColumn('start', to_date(lit("2014-01-01"))).withColumn(
        'end', to_date(lit("2017-05-22"))).select(
            months_between(col("start"), col("end"))).show(1))
#%%
"providing date format"
from pyspark.sql.functions import to_date

dateFormat = "yyyy-dd-MM"

cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
cleanDateDF.show()
Example #19
for i in months:
    df = df.withColumn(
        'issue_d',
        F.regexp_replace('issue_d', i, months[i]).alias('issue_d'))
    df = df.withColumn(
        'last_pymnt_d',
        F.regexp_replace('last_pymnt_d', i, months[i]).alias('last_pymnt_d'))

df = df.withColumn('issue_d', (F.to_date('issue_d',
                                         'MM-dd-yyyy')).alias('issue_d'))
df = df.withColumn('last_pymnt_d',
                   (F.to_date('last_pymnt_d',
                              'MM-dd-yyyy')).alias('last_pymnt_d'))
df = df.withColumn(
    'mnth_start2last',
    F.months_between(df.last_pymnt_d, df.issue_d).alias('mnth_start2last'))
df = df.withColumn('fracNumPmts', F.col('mnth_start2last') / F.col('term2'))

#Optional
## start and end dates based on last payment date:  only when using large samples
#start_date = '2011-01-01'
#end_date = '2013-12-31'

#Early repayments between certain dates
#df = df.filter((df.last_pymnt_d >= start_date)&(df.last_pymnt_d < end_date))
#df.count()

print('\n\n\nWord2vec...\n')

#for tokenizer change null to none
df = df.na.fill({'desc': 'none'})  #perhaps doesn't matter?
Example #20
def toGetRawData(maxmm, syymm, syear):
    new_df = spark.sql(
        """select * from stats.nn_transstyleprice where year_of_sale*100+month_of_sale<={maxmm} and year_of_sale*100+month_of_sale>={syymm} \
                 and vehicle_code>0 and year>={syear} and make!='Tesla'""".
        format(syear=syear, maxmm=maxmm,
               syymm=syymm))  #catch all updated data from May 18 for daily run
    new_df = new_df.select([F.col(x).alias(x.lower()) for x in new_df.columns
                            ]).withColumn('year',
                                          F.col('year').cast("integer"))
    tmvlist = spark.sql(
        "select ed_style_id, year,make,model,tmv_category,model_year_link_code,model_level_link_code,msr_price from \
                    stats.try_nn_newvehiclesf where tmv_category!='R' and    make!='Tesla' and make!='Polestar'"
    )
    tmvlistextra = spark.sql(
        "select ed_style_id, year,make,model,tmv_category,null as model_year_link_code, null as model_level_link_code,total_original_msrp as msr_price from \
                    stats.try_nn_newvehiclesextraf where tmv_category!='R' and    make!='Tesla' and make!='Polestar'"
    )
    tmvlist = tmvlist.union(tmvlistextra).withColumn(
        'year',
        F.col('year').cast("integer"))
    print(tmvlist.count())
    tmvlist.write.format("delta").option(
        "overwriteSchema",
        "true").mode("overwrite").saveAsTable("stats.nn_shortterm_tmvlist")
    tmvlist = spark.sql("select * from stats.nn_shortterm_tmvlist")
    agemaster = spark.sql("select * from stats.nn_vehicleagemaster")

    extralist = spark.sql("select * from stats.nn_shortterm_extraagelist")
    agemaster=agemaster.join(extralist, ['year','make','model'],how='left').withColumn("yymminitial",F.when(F.col("age1")>0,F.col('age1')).otherwise(F.col('yymminitial'))).drop('age1') \
           .withColumn("earliestmm",(F.col('year')-1)*100+2) \
           .withColumn("yymminitial",F.when(F.col("yymminitial")<F.col("earliestmm"),F.col('earliestmm')) \
           .otherwise(F.col('yymminitial'))).drop('earliestmm')
    catfinal = spark.sql("select * from stats.try_catfinal").cache()
    ed4list=spark.sql("select  link_make as make,link_model as model, max(ed4) as ed4 from stats.try_yrmdcatorigin where saleyear>2016 group by make,model") \
                        .withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model"))).dropDuplicates(subset=["make","model"]) \
                        .join(catfinal,['ed4'],how='left').cache()
    originlist=spark.sql("select  distinct link_make,origin from stats.try_yrmdcatorigin where saleyear>2016") \
             .withColumn("make",F.lower(F.col("link_make"))).drop('link_make').dropDuplicates(subset=["make"]).cache()
    transdata=new_df \
    .withColumnRenamed("vehicle_code","ed_style_id") \
    .withColumnRenamed("year_of_sale","saleyear") \
    .withColumnRenamed("month_of_sale","salemonth") \
    .withColumn("syrmm",F.col('saleyear')*100+F.col('salemonth'))\
    .withColumn("ageyear",age_year(F.col("saleyear"),F.col("year"))) \
    .withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model")))\
    .fillna(100,subset=['dtt']) \
    .withColumn("item",F.col("ed_style_id").cast(T.StringType())) \
    .join(agemaster,['year','make','model'],how='left') \
     .withColumn("ptg",F.round(F.col("trans_price")/F.col("trans_msrp"),5))  \
    .join(ed4list,["make","model"],how="left") \
    .join(originlist,["make"],how="left") \
    .join(tmvlist.select("ed_style_id", "tmv_category", "model_year_link_code","model_level_link_code","msr_price"),["ed_style_id"],how="left")\
    .withColumn("msrpf", F.when(F.col("trans_msrp")<F.col("msr_price"),F.col("msr_price")).otherwise(F.col("trans_msrp"))) \
    .withColumn("ptgf", F.when(F.col("ptg")>1.05,1.05).otherwise(F.col("ptg"))) \
    .withColumn("pricef", F.round(F.col("ptgf")*F.col("msrpf")))\
    .cache()
    vehiclelist=tmvlist \
    .withColumn("make",F.lower(F.col("make"))).withColumn("model",F.lower(F.col("model")))\
    .withColumn("item",F.col("ed_style_id").cast(T.StringType())) \
    .join(agemaster,['year','make','model'],how='left') \
    .join(ed4list,["make","model"],how="left") \
    .join(originlist,["make"],how="left") \
    .withColumn('yymmref',F.col("year")*100+F.lit(7)) \
    .withColumn("yymminitial",F.when(F.col("yymmref")<F.col("yymminitial"),F.col("yymmref")).otherwise(F.col("yymminitial"))) \
     .withColumn('iyear',F.col('yymminitial').substr(1,4).cast("integer")) \
     .withColumn('imonth',F.col('yymminitial').substr(5,6).cast("integer")) \
     .withColumn("idate",F.to_date(F.concat_ws("-","iyear","imonth",F.lit(1)))) \
     .withColumn("fdate",F.lit(fdate)) \
     .withColumn("age",F.months_between(F.col("fdate"),F.col('idate'))+1) \
     .withColumn("age",F.when(F.col('age')<1,1).otherwise(F.col('age')))\
     .withColumn("pdate",F.expr("add_months(fdate,0)")) \
     .withColumn("saleyear", F.year('pdate')) \
     .withColumn("salemonth", F.month('pdate'))\
     .withColumn("syrmm",F.col('saleyear')*100+F.col('salemonth'))\
     .withColumn("ageyear",age_year(F.col("saleyear"),F.col("year"))) \
     .cache()
    print(transdata.count())
    print(vehiclelist.count())
    return [transdata, vehiclelist]
Example #21
df = df.withColumn('dti',df['dti'].cast(DoubleType()).alias('dti'))
df = df.withColumn('int_rate',df['int_rate'].cast(DoubleType()).alias('int_rate'))
df = df.withColumn('term2', df['term'].cast(DoubleType()).alias('term2'))

#and handle date strings which are Mon-YYYY format
months={'Jan':'01-01','Feb':'02-15','Mar':'03-15','Apr':'04-15','May':'05-15','Jun':'06-15',\
	'Jul':'07-15','Aug':'08-15','Sep':'09-15','Oct':'10-15','Nov':'11-15','Dec':'12-31'}

for i in months:
	df = df.withColumn('issue_d', F.regexp_replace('issue_d', i , months[i]).alias('issue_d'))
	df = df.withColumn('last_pymnt_d', F.regexp_replace('last_pymnt_d', i , months[i]).alias('last_pymnt_d'))


df = df.withColumn('issue_d', (F.to_date('issue_d', 'MM-dd-yyyy')).alias('issue_d'))
df = df.withColumn('last_pymnt_d', (F.to_date('last_pymnt_d', 'MM-dd-yyyy')).alias('last_pymnt_d'))
df = df.withColumn('mnth_start2last', F.months_between(df.last_pymnt_d, df.issue_d).alias('mnth_start2last'))
df = df.withColumn('fracNumPmts', F.col('mnth_start2last')/F.col('term2'))
	
#Indicators for default, early repayment	


df2 = df #keep a backup
#then drop rows with leftover na's
df = df.na.drop(how='any')

#for large sample size, use a subsample
df = df.sample(fraction=0.1)

#if df.count()>0:
#	del(df2) #if df is okay (did not lose all data!)
Example #22
def create_classroom_feature(pargs, params):
    """
    :inputs: download_df, browse_df, share_df, search_df, fav_df
    :outputs: classroom_data
    """

    download_df = spark.read.parquet(data_paths['download_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    browse_df = spark.read.parquet(data_paths['browse_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    share_df = spark.read.parquet(data_paths['share_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    search_df = spark.read.parquet(data_paths['search_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    fav_df = spark.read.parquet(data_paths['fav_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))

    fav_df2 = fav_df.withColumn("date",
                                to_date(fav_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    fav_df2 = fav_df2.withColumn(
        "MONTH_tmp", F.from_unixtime(F.unix_timestamp(fav_df2.date, "yyyyMM")))
    fav_df2 = fav_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    fav_df3 = fav_df2.withColumn(
        "ADJ_USERID", expr("substring(USERNAME, 1, length(USERNAME)-2)"))
    fav_df3 = fav_df3.withColumn(
        "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))"))
    fav = fav_df3.withColumn("ADJ_USERID",
                             regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    fav = fav.groupby(['ADJ_USERID', 'MONTH']).count()
    fav = fav.withColumnRenamed("count", "num_fav")

    download_df2 = download_df.withColumn(
        "date", to_date(download_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    download_df2 = download_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(download_df2.date, "yyyyMM")))
    download_df2 = download_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    download_df3 = download_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    download_df4 = download_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    download = download_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    download = download.withColumnRenamed("count", "num_" + "download")

    browse_df2 = browse_df.withColumn(
        "date", to_date(browse_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    browse_df2 = browse_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(browse_df2.date, "yyyyMM")))
    browse_df2 = browse_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    browse_df3 = browse_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    browse_df4 = browse_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    browse = browse_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    browse = browse.withColumnRenamed("count", "num_" + "browse")

    share_df2 = share_df.withColumn(
        "date", to_date(share_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    share_df2 = share_df2.withColumn(
        "MONTH_tmp", F.from_unixtime(F.unix_timestamp(share_df2.date,
                                                      "yyyyMM")))
    share_df2 = share_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    share_df3 = share_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    share_df4 = share_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    share = share_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    share = share.withColumnRenamed("count", "num_" + "share")

    search_df2 = search_df.withColumn(
        "date", to_date(search_df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    search_df2 = search_df2.withColumn(
        "MONTH_tmp",
        F.from_unixtime(F.unix_timestamp(search_df2.date, "yyyyMM")))
    search_df2 = search_df2.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    search_df3 = search_df2.withColumn(
        "ADJ_USERID", expr("substring(USERID, 1, length(USERID)-2)"))
    search_df4 = search_df3.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    search = search_df4.groupby(['ADJ_USERID', 'MONTH']).count()
    search = search.withColumnRenamed("count", "num_" + "search")

    data = [("2013-01-01", str(datetime.date.today()))]

    df = spark.createDataFrame(data, ["minDate", "maxDate"])

    df = df.withColumn("monthsDiff", F.months_between("maxDate", "minDate")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", F.expr("add_months(minDate, date)")) \
        .select('date')
    df = df.withColumn(
        "MONTH", F.from_unixtime(F.unix_timestamp(F.col("date")),
                                 "yyyyMM")).select('MONTH')
    unique_id = download.select('ADJ_USERID').distinct() \
        .union(browse.select('ADJ_USERID').distinct()) \
        .union(share.select('ADJ_USERID').distinct()) \
        .union(search.select('ADJ_USERID').distinct()) \
        .union(fav.select('ADJ_USERID').distinct())
    unique_id = unique_id.distinct()
    all_abo_month = unique_id.crossJoin(df)
    combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \
        .union(share.select(['ADJ_USERID', 'MONTH'])) \
        .union(search.select(['ADJ_USERID', 'MONTH'])) \
        .union(fav.select(['ADJ_USERID', 'MONTH']))
    min_max_date = combine.groupby("ADJ_USERID").agg(F.min("MONTH"),
                                                     F.max("MONTH"))
    all_abo_month = all_abo_month.join(
        min_max_date,
        all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID,
        how='left').drop(min_max_date.ADJ_USERID)
    all_abo_month = all_abo_month.filter(F.col("MONTH") >= F.col("min(MONTH)"))
    all_abo_month = all_abo_month.filter(F.col("MONTH") <= F.col("max(MONTH)"))

    all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"])

    download = all_abo_month.join(download, ['ADJ_USERID', 'MONTH'],
                                  'left').na.fill(0)
    for n in range(1, 12):
        download = download.withColumn('num_' + "download" + str(n), F.lag(download['num_' + "download"], n, 0) \
                                       .over(Window.partitionBy("ADJ_USERID").orderBy("MONTH")))
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_3m",
        download['num_' + "download"] + download['num_' + "download" + "1"] +
        download['num_' + "download" + "2"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_6m",
        download["n_lag_currentyr_" + "download" + "_sum_3m"] +
        download['num_' + "download" + "3"] +
        download['num_' + "download" + "4"] +
        download['num_' + "download" + "5"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_9m",
        download["n_lag_currentyr_" + "download" + "_sum_6m"] +
        download['num_' + "download" + "6"] +
        download['num_' + "download" + "7"] +
        download['num_' + "download" + "8"])
    download = download.withColumn(
        "n_lag_currentyr_" + "download" + "_sum_12m",
        download["n_lag_currentyr_" + "download" + "_sum_9m"] +
        download['num_' + "download" + "9"] +
        download['num_' + "download" + "10"] +
        download['num_' + "download" + "11"])
    droplist = []
    for n in range(1, 12):
        droplist = droplist + ['num_' + "download" + str(n)]
    download = download.drop(*droplist)

    # browse, share, search, and fav are built exactly like download above, so
    # a single helper replaces four copies of the same lag-and-sum block.
    def add_lag_sums(df, name):
        df = all_abo_month.join(df, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0)
        w = Window.partitionBy("ADJ_USERID").orderBy("MONTH")
        for n in range(1, 12):
            df = df.withColumn('num_' + name + str(n),
                               F.lag(F.col('num_' + name), n, 0).over(w))
        # running sum of the current month plus the previous n lagged months,
        # checkpointed as a feature at the 3-, 6-, 9-, and 12-month marks
        running = F.col('num_' + name)
        for n in range(1, 12):
            running = running + F.col('num_' + name + str(n))
            if (n + 1) % 3 == 0:
                df = df.withColumn(
                    "n_lag_currentyr_" + name + "_sum_" + str(n + 1) + "m",
                    running)
        return df.drop(*['num_' + name + str(n) for n in range(1, 12)])

    browse = add_lag_sums(browse, "browse")
    share = add_lag_sums(share, "share")
    search = add_lag_sums(search, "search")
    fav = add_lag_sums(fav, "fav")

    classroom_data = (all_abo_month
        .join(download, ['ADJ_USERID', 'MONTH'], 'left')
        .join(browse, ['ADJ_USERID', 'MONTH'], 'left')
        .join(share, ['ADJ_USERID', 'MONTH'], 'left')
        .join(search, ['ADJ_USERID', 'MONTH'], 'left')
        .join(fav, ['ADJ_USERID', 'MONTH'], 'left')
        .na.fill(0))
    classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no")
    classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no")

    # normalize the yyyyMM integer month key into a 'yyyy-MM-dd' date string
    classroom_data = classroom_data \
        .withColumn('mo_yr_key_no', F.col('mo_yr_key_no').cast('string')) \
        .withColumn('mo_yr_key_no', to_timestamp('mo_yr_key_no', 'yyyyMM')) \
        .withColumn('mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))

    print("now saving the data")
    classroom_data.write.parquet(data_paths['classroom_data'].format(
        run_mode=run['run_mode'], run_id=run['run_id']),
                                 mode='overwrite')
# ### Age: from date of birth 'dob'

# In[34]:

users_raw.select('userId', 'timestamp', 'dob').show(5)

# In[35]:

# users_raw.select('timestamp', to_date('timestamp')).show(5)

# In[36]:

# compute user's age
users_age = users_raw.select(
    'userId',
    round(months_between(users_raw['timestamp'], users_raw['dob']) / 12,
          0).alias("age"))

# In[37]:

# check few results:
users_age.show(3)

# In[38]:

users_age.describe().toPandas().transpose()

# # BuyClicks
# purchase behavior of users
# * based on buyId, price
# * amount spent: total, yearly, monthly?
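
# The header above sketches the analysis before the excerpt moves on. A
# minimal, self-contained sketch of total and monthly spend; the buyclicks
# schema (userId, price, timestamp) is assumed here, not taken from the source:
from pyspark.sql.functions import sum as sum_, date_format

buyclicks = spark.createDataFrame(
    [(1, 10.0, '2016-05-01 10:00:00'), (1, 3.0, '2016-06-02 11:00:00'),
     (2, 5.0, '2016-05-03 09:30:00')],
    ['userId', 'price', 'timestamp'])
# total amount spent per user
buyclicks.groupBy('userId').agg(sum_('price').alias('total_spent')).show()
# monthly spend per user, keyed on a yyyy-MM string
buyclicks.groupBy('userId', date_format('timestamp', 'yyyy-MM').alias('month')) \
    .agg(sum_('price').alias('monthly_spent')).show()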
예제 #24
0
    def _compute_longitudinal_age_groups(
            self, cohort: Cohort,
            col_offset: int) -> Tuple[DataFrame, List[str]]:
        """
        Parameters
        ----------
        cohort: Cohort
            cohort on which the age groups should be computed
        col_offset: int
            number of columns used by lagged exposure features

        Returns
        -------
        (age_features, mapping): Tuple(DataFrame, List(str))
            a dataframe containing the age features in aij format and a mapping giving
            the correspondence between column number and age group.
        """
        # This implementation is suboptimal, but we need to have something
        # working with inconsistent python versions across the cluster.
        assert cohort.has_subject_information(), \
            "Cohort subjects should have gender and birthdate information"

        subjects = cohort.subjects.select("patientID", "gender", "birthDate")

        bucket_ids = sf.array([sf.lit(i) for i in range(self.n_buckets)])
        subjects = (subjects.withColumn("bucketID", bucket_ids).select(
            "PatientID",
            "gender",
            "birthDate",
            sf.explode("bucketID").alias("bucket"),
        ).withColumn("dateShift",
                     sf.col("bucket") * self.bucket_size).withColumn(
                         "referenceDate", sf.lit(self.age_reference_date)))
        # Longitudinal age is based on referenceDate instead of minDate to
        # be consistent with cohort definition.
        time_references = sf.expr("date_add(referenceDate, dateShift)")
        longitudinal_age = sf.floor(
            sf.months_between(time_references, sf.col("birthdate")) / 12)
        subjects = subjects.withColumn("longitudinalAge",
                                       longitudinal_age).select(
                                           "patientID", "gender", "birthDate",
                                           "bucket", "longitudinalAge")

        subjects, n_age_groups, mapping = self._bucketize_age_column(
            subjects, "longitudinalAge", "longitudinalAgeBucket")

        assert n_age_groups == self.n_age_groups, (
            "Computed number of age groups is different from the number of specified"
            " age groups at initialization. There might be empty age_groups,"
            " you should investigate this.")

        age_features = subjects.select(
            sf.col("patientID"),
            sf.col("bucket").alias("rowIndex"),
            (sf.col("longitudinalAgeBucket") + col_offset).alias("colIndex"),
        )

        # Remove "age events" which are not in follow-up
        fup_events = self.followups.intersection(self.final_cohort).events
        fup_events = self._discretize_start_end(fup_events)
        fup_events = rename_df_columns(fup_events, prefix="fup_")
        age_features_columns = age_features.columns
        age_features = age_features.join(fup_events, on="patientID")
        age_features = age_features.where(
            sf.col("rowIndex").between(sf.col("fup_startBucket"),
                                       sf.col("fup_endBucket")))
        age_features = age_features.select(*age_features_columns)

        return age_features, mapping
예제 #25
0
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()

    spark.range(5)\
         .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
         .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
                  .select(datediff(col('date2'), col('date1')),
                          months_between(col('date2'), col('date1'))).show()

    # an unparseable date literal yields null instead of raising an exception
    spark.range(5).withColumn('date', to_date(lit('2019-XX-XX'))).show()

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
                  .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
                  .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
                  .withColumn('date_comp4', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('date')))
예제 #26
0
# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)

dateDF.select(
    to_date(lit("2016-01-01")).alias("start"),
    to_date(lit("2017-05-22")).alias("end"))\
  .select(months_between(col("start"), col("end"))).show(1)


# COMMAND ----------

from pyspark.sql.functions import to_date, lit
spark.range(5).withColumn("date", lit("2017-01-01"))\
  .select(to_date(col("date"))).show(1)


# COMMAND ----------

from pyspark.sql.functions import to_date
dateFormat = "yyyy-dd-MM"
cleanDateDF = spark.range(1).select(
    to_date(lit("2017-12-11"), dateFormat).alias("date"),
    to_date(lit("2017-20-12"), dateFormat).alias("date2"))
예제 #27
0
#df_1.select(substring_index("value", "_", 1)).show()

# COMMAND ----------

#Date
from pyspark.sql.functions import current_timestamp, current_date, date_format, dayofyear, year, month, date_add, date_sub, datediff, add_months, months_between, next_day, last_day, date_trunc, lit
orders_new_col.select(
    current_timestamp(), current_date(),
    date_format(current_timestamp(), "yyyy-MM-dd a hh:HH:mm").alias("dat"),
    dayofyear(current_timestamp()).alias("dayofyear_val"),
    month(current_timestamp()).alias("mon"),
    date_sub(current_timestamp(), 1).alias("date_d")).select(
        datediff(current_timestamp(), "date_d")).show(1, truncate=False)
orders_new_col.select(add_months(
    current_timestamp(), -1).alias("prev_mon")).select(
        months_between(current_timestamp(),
                       "prev_mon").alias("no_of_mon")).select(
                           next_day(current_timestamp(), "Mon")).select(
                               last_day(current_timestamp())).show(1)
orders_new_col.select(date_trunc('year', lit('2020-04-01'))).show(1)

# COMMAND ----------

from pyspark.sql.functions import concat_ws
#orders_new_col.printSchema()
#spark.sql("create database if not exists newdb")
#spark.sql("show databases").show()
ordersText = orders_new_col.select(
    concat_ws("~", "order_id", "order_date", "order_customer_id",
              "order_status").alias("concal_col"))
#ordersText.write.mode("overwrite").text("/FileStore/tables/cca175/test_data/retail_db/order_items/text_format")
#read_text_file = spark.read.text("/FileStore/tables/cca175/test_data/retail_db/order_items/text_format")
예제 #28
0
## Create a new column in the borrow table that takes the value 1 if the loan duration exceeds 3 months, and 0 otherwise
## SQL
print("**** Etat des emprunts")
spark.sql(""" select *, 
    case
        when (months_between(to_date(`return_time`, 'dd-MM-yyyy'),
                   to_date(`checkout_time`, 'dd-MM-yyyy')) > 3) then 1 else 0
    end as `Plus de 3 mois`
    from Borrow_SQL """).show()
## DSL
Borrow_export = Borrow\
    .withColumn("format",F.lit("dd-MM-yyyy"))\
    .withColumn("start",F.expr("to_date(`checkout_time`, format)"))\
    .withColumn("end",F.expr("to_date(`return_time`, format)"))\
    .withColumn("Plus de 3 mois",
                F.when(F.months_between(F.col('end'),F.col('start')) > 3,1)\
                .otherwise(0))\
    .drop('format','start','end')

## Path configuration
import configparser
config = configparser.ConfigParser()
config.read('properties.conf')
path_to_output_data = config['BDA']['path']

## Export to CSV
Borrow_export.toPandas().to_csv(path_to_output_data + "Borrow.csv")

## Identify the books that have never been borrowed
## SQL
print("*** Books never borrowed")
예제 #29
0
print("Number of edits per month over all articles: ")
df_monthly.select("title", "yearmonth", "count").show()

min_date, max_date = df_monthly_ts.select(
    min_("yearmonth").cast("long"),
    max_("yearmonth").cast("long")).first()

data = [(min_date, max_date)]
df_dates = spark.createDataFrame(data, ["minDate", "maxDate"])
df_min_max_date = df_dates.withColumn(
    "minDate",
    col("minDate").cast("timestamp")).withColumn(
        "maxDate",
        col("maxDate").cast("timestamp"))

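# Build one row per calendar month between minDate and maxDate: repeat(',', monthsDiff)
# emits a string of monthsDiff commas, split turns it into monthsDiff + 1 empty
# tokens, and posexplode yields each token's position as the month offset to add.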
df_formatted_ts = df_min_max_date.withColumn("monthsDiff", f.months_between("maxDate", "minDate"))\
    .withColumn("repeat", f.expr("split(repeat(',', monthsDiff), ',')"))\
    .select("*", f.posexplode("repeat").alias("date", "val"))\
    .withColumn("date", f.expr("add_months(minDate, date)"))\
    .withColumn("yearmonth", f.concat(f.year("date"), f.lit('-'), format_string("%02d", f.month("date"))))\
    .select('yearmonth')

df_group_ts = df_groups.crossJoin(df_formatted_ts)

df_allts = df_group_ts.join(df_monthly, ['title', 'yearmonth'], how='left') \
    .orderBy('title', 'yearmonth').select('title', 'yearmonth', 'count')

df_allts.orderBy(desc('count')).show(100)

print('Calculate average edits per month for each article :')
window = Window.partitionBy("title").orderBy('yearmonth').rowsBetween(-1, 1)
예제 #30
0
# select date_sub(today, 5), date_add(today, 5) from dateTable

# Another task is to take a look at the difference between two dates. We can do this with datediff function
dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
        .select(datediff(col('week_ago'), col('today'))).show(1)

dateDF.withColumn('week_ago', date_sub(col('today'), 7))\
        .selectExpr('datediff(week_ago, today)').show(1)

spark.sql('select datediff(week_ago, today) from (select *, date_sub(today, 7) as week_ago from dateTable)').show(1)


dateDF.select(
        to_date(lit('2016-01-01')).alias('start'),
        to_date(lit('2017-05-22')).alias('end'))\
        .select(months_between(col('start'), col('end'))).show(1)


dateDF.selectExpr(
        'to_date("2016-01-01") as start',
        'to_date("2017-05-22") as end')\
        .selectExpr(
                'months_between(start, end)',
                'datediff("2016-01-01", "2017-01-01")'
                ).show(1)

# This introduces the to_date function, which converts a string to a date,
# optionally with a specified format. Formats use the Java SimpleDateFormat
# pattern syntax.

spark.range(5).withColumn('date', lit('2017-01-01'))\
        .select(to_date(col('date'))).show(1)
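
# The comment above mentions the optional format argument, which the snippet
# never shows; a minimal sketch with a day-first pattern chosen for illustration:
spark.range(5).withColumn('date', lit('01-2017-01'))\
        .select(to_date(col('date'), 'dd-yyyy-MM')).show(1)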
예제 #31
0
# Alternatively, use the `to_timestamp` function:
from pyspark.sql.functions import to_timestamp
rides \
  .withColumn("date_time_fixed", to_timestamp("date_time", format="yyyy-MM-dd HH:mm")) \
  .select("date_time", "date_time_fixed") \
  .show(5)

# ### Example 8: Computing the age of each rider

# Use the `current_date` and `months_between` functions to compute the age of
# each rider:
from pyspark.sql.functions import current_date, months_between, floor
riders \
  .withColumn("today", current_date()) \
  .withColumn("age", floor(months_between("today", "birth_date") / 12)) \
  .select("birth_date", "today", "age") \
  .show(5)

# **Note:** Spark implicitly casts `birth_date` or `today` as necessary.  It is
# probably safer to explicitly cast one of these columns before computing the
# number of months between.
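
# For instance, making the casts explicit before calling months_between
# (a sketch of the same computation, not taken from the source):
from pyspark.sql.functions import col
riders \
  .withColumn("today", current_date()) \
  .withColumn("age", floor(months_between(col("today").cast("date"), col("birth_date").cast("date")) / 12)) \
  .select("birth_date", "today", "age") \
  .show(5)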

# ## Working with Boolean columns

# ### Example 10: Predefining a Boolean column expression

# You can predefine a Boolean column expression:
studentFilter = col("student") == 1
type(studentFilter)
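
# The predefined expression can be reused in filters or combined with others.
# A tiny made-up frame to demonstrate (the `students` data is hypothetical):
students = spark.createDataFrame([(1, 20), (0, 30)], ["student", "age"])
students.where(studentFilter).show()
students.where(studentFilter & (col("age") >= 18)).show()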