Exemplos de current_date em Python, exemplos de pyspark.sql.functions.current_date em Python

Exemplo n.º 1

0

Exibir arquivo

    def baseline_model(self, df_processed):
        cond_dvs = F.when(col("dvs_p30days") > 400,
                          400).otherwise(col("dvs_p30days"))


        stats=df_processed\
        .withColumn("dvs_p30days",cond_dvs)\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .groupby(['grt_l2_cat_name'])\
        .agg(F.avg(col("dvs_p30days")).alias("avg_dvs_p30days")
            ,F.stddev(col("dvs_p30days")).alias("std_deal_view")
            ,F.round(F.avg(col("dvs_p30days")).cast('integer')).alias("avg_deal_view")
            ,F.max("dvs_p30days").alias("max_deal_view"))

        w = Window.partitionBy(F.col('consumer_id')).orderBy(
            F.col('normalized_dvs_p30days').desc())

        df_final=df_processed\
        .filter(col("recent_event_date")==1)\
        .filter(col("event_date")>=F.date_add(F.current_date(),-15))\
        .join(stats,on='grt_l2_cat_name')\
        .withColumn('normalized_dvs_p30days',(col('dvs_p30days')-col('avg_deal_view'))/col('std_deal_view'))\
        .withColumn('normalized_dvs_p30days_rank',F.row_number().over(w))


        df_micro=df_final\
        .filter(col('normalized_dvs_p30days')>=0)\
        .filter(col('normalized_dvs_p30days_rank')==1)\
        .filter(col('grt_l2_purchaser_14d')==0)

        return df_micro

Exemplo n.º 2

0

Exibir arquivo

Arquivo: datetimetransformer.py Projeto: o9812/Big-Data-2018

    def age_calculator(self, columns, age_colname):
        """
        @columns: a string of column name
        @age_colname: a string of the new column of age, e.g.'age'
        """
        assert isinstance(columns,
                          str), "Error: columns argument must be a string!"
        assert isinstance(
            age_colname,
            str), "Error: the name for age column argument must be a string!"

        df = self._df.select(
            columns,
            from_unixtime(unix_timestamp(columns,
                                         'MM/dd/yyy')).alias('with time'))
        df = df.withColumn('without time', df['with time'].cast(DateType()))
        df = df.withColumnRenamed(columns, 'tmp')
        df = df.withColumn(age_colname, (
            (datediff(current_date(), col('without time')).cast(FloatType())) /
            365).cast(IntegerType()))
        self._df = self._df.join(
            df,
            col(columns) == col('tmp'),
            'leftouter').drop('tmp').drop('with time').drop('without time')

        return self

Exemplo n.º 3

0

Exibir arquivo

Arquivo: DailyTrafficMISEngine.py Projeto: kaushu42/foxhound-security-solution

 def _preprocess(self):
     self._df = self._df[self._REQUIRED_COLUMNS]
     self._df = self._df.toDF(*self._HEADER_NAMES)
     self._df = self._df.withColumn("logged_datetime", to_timestamp(
         self._df.logged_datetime, 'yyyy/MM/dd'))
     self._df = self._df.withColumn("processed_datetime", current_date())
     self._df = self._df.withColumn('processed_datetime',
                                    to_timestamp(self._df.processed_datetime))
     self._df = self._df.withColumn(
         "source_port", self._df["source_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "destination_port", self._df["destination_port"].cast(IntegerType()))
     self._df = self._df.withColumn(
         "bytes_sent", self._df["bytes_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "bytes_received", self._df["bytes_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_received", self._df["packets_received"].cast(LongType()))
     self._df = self._df.withColumn(
         "packets_sent", self._df["packets_sent"].cast(LongType()))
     self._df = self._df.withColumn(
         "time_elapsed", self._df["time_elapsed"].cast(LongType()))
     self._df = self._df.withColumn(
         "repeat_count", self._df["repeat_count"].cast(LongType()))
     self._df = self._df.withColumn("count_events", lit(1))

Exemplo n.º 4

0

Exibir arquivo

    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(
                    date_format(
                        unix_timestamp(column, dateFormat).cast("timestamp"),
                        Format), current_date()) / 12), 4).alias(nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self

Exemplo n.º 5

0

Exibir arquivo

Arquivo: azureexpense.py Projeto: tdang2/databricks_explorer

def process_expense_file(file_path):
  tempDf = spark.read.load(file_path, format="csv", header="true", escape='"')
  tempDf = tempDf.filter(tempDf["Tags"].isNotNull()).select(
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("PreTaxCost"),
    col("ResourceType"),
    col("UsageDateTime"),
    split(col("Tags"), ",").alias("tags")
  )
  return tempDf.withColumn("costCenterNumber", lit(get_cost_center(tempDf["tags"])))\
          .withColumn("team", lit(get_team(tempDf["tags"])))\
          .withColumn("usageMonth", month(tempDf['UsageDateTime']))\
          .withColumn("usageYear", year(tempDf['UsageDateTime']))\
          .withColumn("processedDate", current_date())\
          .select(
    col("processedDate"),
    col("SubscriptionName"),
    col("ResourceGroup"),
    col("ResourceLocation"),
    col("ProductName"),
    col("ResourceType"),
    col("costCenterNumber"),
    col("team"),
    col("usageMonth"),
    col("usageYear"),
    col("UsageDateTime"),
    col("PreTaxCost")
  ).filter("costCenterNumber is not NULL")

Exemplo n.º 6

0

Exibir arquivo

    def put(self, df):
        """Writes data to final CSV file and stops spark session. 

        Arguments: 
            df[spark datafarme] -- spark dataframe that
        """
        try:
            self.logger.info('Begin putting data to disk // {}'.format(
                self.etl_id))

            jdbc_params = self._inst_jdbc_params()
            jdbc_params['dbtable'] = 'final_data'

            df = df.withColumn('process_date', F.current_date())

            df.write.format('jdbc').options(**jdbc_params)\
                .mode('overwrite')\
                .save()

            self.logger.info('Job complete // {}'.format(self.etl_id))

            self.spark_session.stop()

        except Exception as e:
            self.logger.error('{} // {}'.format(e, self.etl_id))

Exemplo n.º 7

0

Exibir arquivo

Arquivo: App.py Projeto: balaspark/webanalytics

    def getReadRDBMSDataFrame():

        from_format = "dd/MMM/yyyy:HH:mm:ss"
        to_format = 'dd-MM-yyyy'
        storeIncrementalValue(0, "ignore")

        df = spark.read.format("csv") \
            .option("inferSchema", True) \
            .option("header", True) \
            .load(argv[17])

        df = df.filter(df.createdt.isNotNull() & (df.createdt != "null"))

        result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \
            .withColumn("value",f.col("value").cast(f.StringType()))\
            .withColumn("score",f.col("score").cast(f.StringType()))\
            .withColumn("regioncode",f.col("regioncode").cast(f.StringType()))\
            .withColumn("status",f.col("status").cast(f.StringType()))\
            .withColumn("count",f.col("count").cast(f.StringType()))\
            .withColumn("statuscode",f.col("statuscode").cast(f.StringType()))\
            .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode", "status", "method",
                    "key", "count", "type", "site", "statuscode") \
            .withColumn("rdbms_current_date", f.current_date()) \
            .filter(f.col("id") > getIncrementalValue())

        result = result.withColumn(
            "id",
            f.col("id").cast(f.StringType())).na.fill("Not_Applicable")

        max_value = df.agg({"id": "max"}).collect()[0]
        storeIncrementalValue(max_value["max(id)"], "overwrite")

        result.printSchema()

        return result

Exemplo n.º 8

0

Exibir arquivo

Arquivo: DfTransf.py Projeto: mood-agency/optimus

    def ageCalculate(self, column, dateFormat, nameColAge):
        """
        This method compute the age of clients based on their born dates.
        :param  column      Name of the column born dates column.
        :param  dateFormat  String format date of the column provided.
        :param  nameColAge  Name of the new column, the new columns is the resulting column of ages.

        """
        # Check if column argument a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if dateFormat argument a string datatype:
        self.__assertTypeStr(dateFormat, "dateFormat")

        # Asserting if column if in dataFrame:
        assert column in self.__df.columns, "Error: Column assigned in column argument does not exist in dataFrame"

        # Output format date
        Format = "yyyy-MM-dd"  # Some SimpleDateFormat string

        exprs = format_number(
            mag(
                months_between(date_format(
                    unix_timestamp(column, dateFormat).cast("timestamp"), Format), current_date()) / 12), 4).alias(
            nameColAge)

        self.__df = self.__df.withColumn(nameColAge, exprs)

        self.__addTransformation()  # checkpoint in case

        return self

Exemplo n.º 9

0

Exibir arquivo

Arquivo: goods_supply.py Projeto: ZhiYinZhang/study

def county_reality_supply():
    # 各区县各档位该品规上周投放量
    try:
        print(f"{str(dt.now())} 各区县各档位该品规上周投放量")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # com_id与city的映射关系
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # sale_center_id与区(list)的映射关系
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # 获取上周实际投放量
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        #需要计算的值的列名
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()

Exemplo n.º 10

0

Exibir arquivo

def get_crm_cust_log(spark):
    #crm_cust_log 变更记录表
    # change_type:  CO_CUST.STATUS  状态变更
    #               CO_CUST.CUST_SEG  档位变更
    crm_cust_log = spark.sql("select cust_id,change_type,change_frm,change_to,audit_date from DB2_DB2INST1_CRM_CUST_LOG") \
        .withColumn("audit_date", f.to_date("audit_date", "yyyyMMdd")) \
        .withColumn("day_diff",f.datediff(f.current_date(), col("audit_date"))) \
        .withColumn("change_frm", f.regexp_replace(col("change_frm"), "(zz)|(ZZ)", "31")) \
        .withColumn("change_to", f.regexp_replace(col("change_to"), "(zz)|(ZZ)", "31"))
    return crm_cust_log

Exemplo n.º 11

0

Exibir arquivo

Arquivo: ingest_transform_hcp_data.py Projeto: ashishforgit/additional_py_code

def create_oracle_output():
    df_hcp_match_final = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.path'))

    df_hcp_yesterdays_data = \
        spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.path')) \
            .filter(F.col('full_mk').isNotNull()) \
            .filter(F.col('prop_mk').isNotNull()) \
            .filter(F.col('pnr_mk').isNotNull())

    join_fields = ['lastmodified_datetime', 'full_mk', 'prop_mk', 'pnr_mk', 'match_level', 'new_rate_bucket']

    df_hcp_yesterday = df_hcp_yesterdays_data \
        .withColumn('match_flag', F.lit('Y')) \
        .join(df_hcp_match_final.select(join_fields),
              on=join_fields) \
        .persist()

    df_hcp_today = df_hcp_match_final \
        .withColumn('dl_last_modified_date', F.current_date()) \
        .withColumn('rk', F.lit(1)) \
        .withColumn('rk_f', F.lit(1)) \
        .join(df_hcp_yesterday.select(join_fields + ['match_flag']),
              on=join_fields,
              how='left') \
        .filter(F.col('match_flag').isNull())

    # TODO: Remove redundant fields and remnants of deduping keys which only contain a single value
    df_hcp_oracle = \
        cast_fields_for_oracle_output(df_hcp_yesterday) \
            .union(cast_fields_for_oracle_output(df_hcp_today)) \
            .withColumn('dwh_rk',
                        F.row_number().over(Window
                                            .partitionBy(F.col('pnr_mk'))
                                            .orderBy(F.col('pnr_mk').desc()))) \
            .filter(F.col('dwh_rk') == 1) \
            .drop('dwh_rk')

    # TODO: Investigate partitioning of Oracle data suitable for end-user
    # Dump to staging path since Spark cannot simultaneously read and write to the same path
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.table'))
    df_hcp_yesterday.unpersist()
    df_hcp_oracle = spark.read.parquet(config.get(config_set, 'hcp.txns.new.fin.f.d2.staging.path'))
    dump_dataframe(df=df_hcp_oracle,
                   path=config.get(config_set, 'hcp.txns.new.fin.f.d2.path'),
                   table_name=config.get(config_set, 'hcp.txns.new.fin.f.d2.table'))

    # Oracle table for EDM
    dump_dataframe(df=df_hcp_oracle
                   .withColumnRenamed('rate_bucket', 'original_rate_bucket')
                   .withColumnRenamed('new_rate_bucket', 'rate_bucket'),
                   path=config.get(config_set, 'hcp.txns.to.dwh.f.path'),
                   table_name=config.get(config_set, 'hcp.txns.to.dwh.f.table')
                   )

Exemplo n.º 12

0

Exibir arquivo

def adobe_traffic_by_channel(df_traffic_by_channel):
    return (df_traffic_by_channel.select(
        df_traffic_by_channel['id'], df_traffic_by_channel['createdat'],
        df_traffic_by_channel['reportsuiteid'],
        df_traffic_by_channel['reportsuitename'],
        df_traffic_by_channel['server'], df_traffic_by_channel['data'],
        df_traffic_by_channel['hour'], df_traffic_by_channel['channel'],
        df_traffic_by_channel['visits'], df_traffic_by_channel['orders'],
        df_traffic_by_channel['ordersserialized'],
        df_traffic_by_channel['revenue']).withColumn(
            'dataprocessamento', f.trunc(f.current_date(), 'month')))

Exemplo n.º 13

0

Exibir arquivo

Arquivo: helpers.py Projeto: 0xd5dc/amazon_review_valitator

def prepare_features(df):
    df = df.withColumn(
        'exclam',
        length('review_body') -
        length(regexp_replace('review_body', '\!', '')))
    df = df.withColumn('age',
                       datediff(current_date(), to_date(df['review_date'])))
    df = df.withColumn('review_length', length(df['review_body']))
    df = df.withColumn('helfulness', df['helpful_votes'] / df['total_votes'])
    df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
    select_cols = df.select(
        ['star_rating', 'helfulness', 'age', 'review_length',
         'label']).na.fill(0)
    return select_cols

Exemplo n.º 14

0

Exibir arquivo

def merge(spark, update, tableName, cols, key):
    """
    将DataFrame和delta表进行merge操作，insert操作要求DataFrame必须包含delta表所有的列(0.5版本)
    当我们使用merge操作更新/插入delta表其中几列时，指定在DataFrame中不存在的列的值为null。

    注：DataFrame中要写入delta表的列要和delta表一样
    :param spark,SparkSession实例
    :param update,spark DataFrame
    :param tableName,要更新的delta表
    """
    # 如果没有dt列，创建当前日期的dt列
    if "dt" not in cols:
        update = update.withColumn("dt", f.current_date())
        cols.append("dt")

    # 1.构建merge条件
    mergeExpr = f"origin.{key}=update.{key}"
    print(f"merge expression:{mergeExpr}")

    # 2.构建更新表达式
    updateExpr = {}
    for c in cols:
        updateExpr[c] = f"update.{c}"

    print(f"update expression:{updateExpr}")

    origin = DeltaTable.forPath(spark, tableName)
    origin_cols = origin.toDF().columns

    # 3.构建插入表达式
    insertExpr = {}
    for origin_col in origin_cols:
        if origin_col in cols:
            insertExpr[origin_col] = f"update.{origin_col}"
        else:
            # 不存在,插入null值(不是字符串)
            insertExpr[origin_col] = "null"

    print(f"insert expression:{insertExpr}")

    # for origin_col in origin_cols:
    #     if origin_col not in cols:
    #         update=update.withColumn(origin_col,f.lit(None))

    origin.alias("origin") \
        .merge(update.alias("update"), mergeExpr) \
        .whenMatchedUpdate(set=updateExpr) \
        .whenNotMatchedInsert(values=insertExpr) \
        .execute()

Exemplo n.º 15

0

Exibir arquivo

 def test_current_date(self, input_df, spark_session):
     """Substitute the cleansed values with the current date"""
     cleaning_definitions = dict(status=dict(
         elements=["active", "inactive"], default=F.current_date()))
     expected_output_df = spark_session.createDataFrame([
         Row(id=1, status="active"),
         Row(id=2, status=str(dt.date.today())),
         Row(id=3, status=str(dt.date.today())),
         Row(id=4, status="inactive"),
         Row(id=5, status=None),
         Row(id=6, status=str(dt.date.today())),
     ])
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definitions).transform(input_df)
     assert_df_equality(expected_output_df, output_df)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: columns.py Projeto: marcelomata/Optimus

        def _years_between(_new_col_name, attr):
            _date_format = attr[0]
            _col_name = attr[1]

            return F.format_number(
                F.abs(
                    F.months_between(
                        F.date_format(
                            F.unix_timestamp(
                                _col_name,
                                _date_format).cast("timestamp"),
                            format_dt),
                        F.current_date()) / 12), 4) \
                .alias(
                _new_col_name)

Exemplo n.º 17

0

Exibir arquivo

Arquivo: App.py Projeto: balaspark/webanalytics

    def getRandomUserWebApiDataFrame():

        final_list = []
        user_set = set()
        results_json = dict()
        randomuserurl = argv[7]
        ALLOWED_RANDOM_USER = int(argv[8])

        while True:
            resp = json.loads(
                urllib.request.urlopen(randomuserurl).read().decode('utf-8'))
            for randomuser in resp['results']:
                username = removeDigits(randomuser.get('user').get('username'))
                if username not in user_set:
                    user_set.add(username)
                    final_list.append(randomuser)
                if len(user_set) == ALLOWED_RANDOM_USER:
                    break

            if len(user_set) == ALLOWED_RANDOM_USER:
                break

        results_json["results"] = final_list
        json_df = spark.read.json(sc.parallelize([results_json]))
        users = json_df.withColumn("results", f.explode(f.col("results"))) \
            .withColumn("r_username", f.regexp_replace("results.user.username", '[0-9]', '')) \
            .withColumn("dob", f.from_unixtime(f.unix_timestamp(f.col('results.user.dob').cast('timestamp')), 'dd-MM-yyyy')) \
            .withColumn("user_current_date", f.current_date()) \
            .select("r_username", "results.user.gender", "results.user.name.title",
                    f.col("results.user.name.first").alias("firstname"),
                    f.col("results.user.name.last").alias("lastname"),
                    f.col("results.user.location.street").alias("location_street"),
                    f.col("results.user.location.city").alias("location_city"),
                    f.col("results.user.location.state").alias("location_state"),
                    f.col("results.user.location.zip", ).alias("location_zip"),
                    "results.user.email", "results.user.password", "results.user.salt", "results.user.md5",
                    "results.user.sha1", "results.user.sha256", "results.user.registered", "dob", "results.user.phone",
                    "results.user.cell", f.col("results.user.picture.large").alias("picture_large"),
                    f.col("results.user.picture.medium").alias("picture_medium"), "results.user.picture.thumbnail",
                    "user_current_date")

        return users

Exemplo n.º 18

0

Exibir arquivo

Arquivo: listings_data_cleaning.py Projeto: carlolY/Vancouver-Airbnb

def main(inp, outp):
    listings = spark.read.csv(inp, header=True, schema=schema)
    listings = listings.drop('host_id', 'host_name', 'neighbourhood_group',
                             'reviews_per_month',
                             'calculated_host_listings_count',
                             'availability_365')

    # listings.describe(['minimum_nights']).show()

    # Give a range to minimum_nights
    listings = listings.filter("minimum_nights > 0 AND minimum_nights <= 30")

    # Remove listings with last_review posted more than 2 years ago
    listings = listings.withColumn('curr', f.current_date())
    listings = listings.withColumn(
        'diff', f.datediff(f.col('curr'), f.col('last_review')))
    cleaned_listings = listings.filter(listings['diff'] < 730).drop(
        'curr', 'diff', 'number_of_reviews', 'last_review')
    ''' Coalescion is fine, as listing data only contains airbnb locations in Vancouver '''
    cleaned_listings.coalesce(1).write.csv(outp, mode='overwrite', header=True)

Exemplo n.º 19

0

Exibir arquivo

 def createAgeDateColumns(self, df):
     """Criar colunas de idade do cliente e data do pedido.
     
     Args:
         df: Dataframe
     
     Returns:
         Dataframe.
     """
     log(
         "INFO",
         f"[createAgeDateColumns] Criando colunas de idade e data do pedido."
     )
     return (df.withColumn(
         "data_pedido_date",
         date_format(
             from_unixtime(col('data_pedido')),
             "yyyy-MM-dd")).withColumn('idade', (months_between(
                 current_date(), col('data_nascimento_cliente')) / 12).cast(
                     IntegerType())))

Exemplo n.º 20

0

Exibir arquivo

def get_grade_cons_except():
    print(f"{str(dt.now())} 近30天零售户档位与周边消费水平不符")
    try:
        consume_level=get_all_consume_level(spark)\
                             .select("city","cust_id","consume_level")
        cust_cluster=get_cust_cluster(spark)

        #只展示目前有效的零售户    去掉 过去有效 现在无效的零售户
        valid_co_cust=get_valid_co_cust(spark).select("cust_id","sale_center_id")

        #近30天零售户档位  一个零售户近30天内多种档位，分别预警
        cust_seg=spark.sql("select * from DB2_DB2INST1_CO_CUST")\
             .where(f.datediff(f.current_date(),col("dt"))<=30)\
             .where(col("com_id").isin(["011114305","011114306","011114302"]))\
             .withColumn("cust_seg",f.regexp_replace("cust_seg","ZZ","31"))\
             .select("com_id","cust_id","cust_seg")\
             .join(valid_co_cust,"cust_id")\
             .dropDuplicates(["cust_id","cust_seg"])

        cols = {"value": "retail_rim_cons",
                "level1_code": "classify_level1_code",
                }
        cust_seg_cons=cust_seg.join(consume_level,"cust_id")\
                             .withColumnRenamed("consume_level",cols["value"])\
                             .join(cust_cluster,"cust_id")\
                             .select("city","cust_id","cust_seg","cluster_index","sale_center_id",cols["value"])

        result=except_grade(cust_seg_cons,cols,["city","cust_seg","cluster_index"],[3,4,5])\
                .withColumnRenamed("cust_seg","retail_grade")\
                .withColumnRenamed("mean","city_grade_cons_avg") \
                .withColumn(cols["level1_code"], f.lit("YJFL003"))

        values = [
                 "avg_orders_plus3", "avg_orders_minu3", "avg_orders_plus4",
                 "avg_orders_minu4", "avg_orders_plus5", "avg_orders_minu5",
                 "warning_level_code", "cust_id", "city", "sale_center_id",
                 "retail_grade", "city_grade_cons_avg"
                 ] + list(cols.values())
        result.foreachPartition(lambda x:write_hbase2(x,values,hbase))
    except Exception:
        tb.print_exc()

Exemplo n.º 21

0

Exibir arquivo

 def execute(self):
     """
     Method overrides the execute method of the abstract Task class. Method classifies the
     recipe based on the difficulty. Difficulty is classified based on the total time required
     for prepping and cooking the food.
     If sum(prep time + cooking time) > 1 Hour is as Hard.
     If sum(prep time + cooking time) > 30 minutes and less than an hour is as Medium
     If sum(prep time + cooking time) < 30 minutes as Easy
     As 'Unknown' for all other cases.
     :return: pyspark.sql.DataFrame
                     Returns a Spark's DataFrame with the transformation for the recipe.
     """
     return self.data_frame \
         .withColumn('cookTimeInMinutes',
                     when(lower(col('cookTime')).endswith('h'),
                          regexp_extract('cookTime', '\d+', 0) * 60)
                     .when(lower(col('cookTime')).endswith('m'),
                           regexp_extract('cookTime', '\d+', 0)).otherwise("")) \
         .withColumn('prepTimeInMinutes',
                     when(lower(col('prepTime')).endswith('h'),
                          regexp_extract('prepTime', '\d+', 0) * 60)
                     .when(lower(col('prepTime')).endswith('m'),
                           regexp_extract('prepTime', '\d+', 0)).otherwise("")) \
         .withColumn('difficulty',
                     when(expr('(cookTimeInMinutes + prepTimeInMinutes) > 60'), "Hard")
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) <= 60 and '
                                '(cookTimeInMinutes + prepTimeInMinutes) >= 30'), 'Medium')
                     .when(expr('(cookTimeInMinutes + prepTimeInMinutes) < 30'), "Easy")
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes > 60'), 'Hard')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes >= 30 and  prepTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('cookTimeInMinutes = "" and prepTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes > 60'), 'Hard')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes >= 30 and  cookTimeInMinutes <= 60'),
                           'Medium')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes < 30'), 'Easy')
                     .when(expr('prepTimeInMinutes = "" and cookTimeInMinutes = ""'), 'Unknown')
                     .otherwise("Unknown")) \
         .withColumn('date_of_execution', current_date()) \
         .drop('cookTimeInMinutes') \
         .drop('prepTimeInMinutes')

Exemplo n.º 22

0

Exibir arquivo

def get_co_co_01(spark, scope: list, filter="day"):
    """

    :param spark: SparkSession
    :param scope: [lower,upper]  日期过滤范围
    :param filter: 过滤类别  "day","week","month"  default:"day"
    :return:
    """
    # 获取co_co_01        unique_kind: 90 退货  10 普通订单    pmt_status:  0 未付款  1 收款完成
    co_co_01 = spark.sql(
        "select  * from DB2_DB2INST1_CO_CO_01 where (unique_kind = 90 and pmt_status=0) or (unique_kind=10 and pmt_status=1)") \
        .where(col("com_id").isin(cities)) \
        .where(col("born_date").rlike("\d{8}")) \
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_sum", col("qty_sum").cast("float")) \
        .withColumn("amt_sum", col("amt_sum").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_01 = co_co_01.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_01=co_co_01.withColumn("week_diff", week_diff("today", "born_date"))\
                         .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_01 = co_co_01.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter is 'day'/'week'/'month'")
    else:
        raise Exception("lower  must >=0 and upper  must >= lower")

    return co_co_01

Exemplo n.º 23

0

Exibir arquivo

def get_co_co_line(spark, scope: list, filter="day"):
    """

        :param spark: SparkSession
        :param scope: [lower,upper]  日期过滤范围
        :param filter: 过滤类别  "day","week","month"  default:"day"
        :return:
        """
    co_co_line = spark.sql(
        "select * from DB2_DB2INST1_CO_CO_LINE") \
        .where(col("com_id").isin(cities))\
        .where(col("born_date").rlike("\d{8}"))\
        .withColumn("born_date", f.to_date("born_date", "yyyyMMdd")) \
        .withColumn("today", f.current_date()) \
        .withColumn("qty_ord", col("qty_ord").cast("float")) \
        .withColumn("price", col("price").cast("float"))\
        .withColumn("qty_rsn",col("qty_rsn").cast("float"))

    lower = scope[0]
    upper = scope[1]
    if lower >= 0 and upper >= lower:
        if filter in ["day", "week", "month"]:
            if filter == "day":
                co_co_line = co_co_line.withColumn("day_diff", f.datediff(col("today"), col("born_date"))) \
                    .where((col("day_diff") >= lower) & (col("day_diff") <= upper))

            elif filter == "week":
                co_co_line = co_co_line.withColumn("week_diff", week_diff("today", "born_date")) \
                    .where((col("week_diff") >= lower) & (col("week_diff") <= upper))
            else:
                co_co_line = co_co_line.withColumn("month_diff",
                                               month_diff_udf(f.year(col("born_date")), f.month(col("born_date")),
                                                              f.year(col("today")), f.month(col("today")))) \
                    .where((col("month_diff") >= lower) & (col("month_diff") <= upper))
        else:
            raise Exception("filter is 'day'/'week'/'month'")
    else:
        raise Exception("lower  must >=0 and upper  must >= lower")

    return co_co_line

Exemplo n.º 24

0

Exibir arquivo

def adobe_traffic_by_department(df_traffic_by_department, df_deparadepto):
    df_depara = (df_deparadepto.filter(
        (df_deparadepto['tabela'] == 'departamento')
        & (df_deparadepto['campo'] == 'iddepartamento')).select(
            df_deparadepto['de'], df_deparadepto['para']))

    return (df_traffic_by_department.join(
        df_depara,
        df_traffic_by_department['iddepartament'] == df_depara['de'],
        'left').select(df_traffic_by_department['id'],
                       df_traffic_by_department['createdat'],
                       df_traffic_by_department['reportsuiteid'],
                       df_traffic_by_department['reportsuitename'],
                       df_traffic_by_department['server'],
                       df_traffic_by_department['data'],
                       df_traffic_by_department['hour'],
                       df_traffic_by_department['iddepartament'],
                       f.trim(df_depara['para']).alias('namedepartment'),
                       df_traffic_by_department['visits'],
                       df_traffic_by_department['orders'],
                       df_traffic_by_department['ordersserialized'],
                       df_traffic_by_department['revenue']).withColumn(
                           'dataprocessamento',
                           f.trunc(f.current_date(), 'month')))

Exemplo n.º 25

0

Exibir arquivo

Arquivo: test_creator.py Projeto: goodwillpunning/hyperleaup

    def test_create(self):
        data = [
            (1001, "Jane", "Doe", "2000-05-01", 29.0, False),
            (1002, "John", "Doe", "1988-05-03", 33.0, False),
            (2201, "Elonzo", "Smith", "1990-05-03", 21.0, True),
            (2202, None, None, "1980-05-03", 45.0, False),  # Add a few nulls
            (2235, "", "", "1980-05-03", 43.0, True)
        ]
        df = get_spark_session().createDataFrame(
            data, ["id", "first_name", "last_name", "dob", "age", "is_temp"])

        # Ensure that a Hyper file can be created with date and timestamp columns
        df.withColumn("hire_date", current_date())
        df.withColumn("last_updated", current_timestamp())

        creator = Creator(df, 'employees', False)
        hyper_file_path = creator.create()
        assert (hyper_file_path == "/tmp/hyperleaup/employees/employees.hyper")
        tables = TestUtils.get_tables(
            "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (len(tables) == 1)
        num_rows = TestUtils.get_row_count(
            "Extract", "Extract", "/tmp/hyperleaup/employees/employees.hyper")
        assert (num_rows == 5)

Exemplo n.º 26

0

Exibir arquivo

Arquivo: pg_dump_to_parquet.py Projeto: mozilla/mozaggregator2bq

def main(input_dir, output_dir):
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.session.timeZone", "UTC")

    df = read_pg_dump(spark, input_dir)

    # apply some other transformations to obtain the final table
    result = (
        df.withColumn("parts", F.split("table_name", "_")).withColumn(
            "dimension", F.from_json("dimension", DIMENSION_SCHEMA)).
        select(
            F.current_date().alias("ingest_date"),
            "aggregate_type",
            "ds_nodash",
            # parts[:2] form aggregate_type, this is parsed from the filename
            F.col("parts").getItem(2).alias("channel"),
            F.col("parts").getItem(3).alias("version"),
            # parts[-1] is ds_nodash, this is parsed from the filename
            "dimension.*",
            from_pg_array_udf("aggregate"),
        ))
    # -RECORD 0-----------------------------------------------------
    # aggregate_type | submission_date
    # channel        | nightly
    # version        | 43
    # ds_nodash      | 20191201
    # os             | Windows_NT
    # child          | false
    # label          |
    # metric         | A11Y_INSTANTIATED_FLAG
    # osVersion      | 10.0
    # application    | Firefox
    # architecture   | x86
    # aggregate      | "[0, 2, 0, 2, 2]"

    result.repartition(1).write.parquet(output_dir, mode="overwrite")

Exemplo n.º 27

0

Exibir arquivo

Arquivo: source_data_loading.py Projeto: Kunalred/UnileverData

            jdbc_params = {
                "url": ut.get_mysql_jdbc_url(app_secret),
                "lowerBound": "1",
                "upperBound": "100",
                "dbtable": src_conf["mysql_conf"]["dbtable"],
                "numPartitions": "2",
                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            txn_df = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load() \
                .withColumn("ins_dt", functions.current_date())
            txn_df.show()

            txn_df.write \
                .mode('overwrite')\
                .partitionBy("INS_DT")  \
                .parquet("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/staging/SB")

            print("\nWriting SB data to S3 <<")

        elif src == "OL":
            print("\nReading OL data from sftp >>")
            txn_df2 = spark.read.format("com.springml.spark.sftp") \
                .option("host", app_secret["sftp_conf"]["hostname"]) \
                .option("port", app_secret["sftp_conf"]["port"]) \
                .option("username", app_secret["sftp_conf"]["username"]) \

Exemplo n.º 28

0

Exibir arquivo

Arquivo: source_data_loading.py Projeto: muli2487/datamart

                "partitionColumn": src_conf["mysql_conf"]["partition_column"],
                "user": app_secret["mysql_conf"]["username"],
                "password": app_secret["mysql_conf"]["password"]
            }
            # print(jdbcParams)

            # use the ** operator/un-packer to treat a python dictionary as **kwargs
            print(
                "\nReading data from MySQL DB using SparkSession.read.format(),"
            )
            txnDF = spark\
                .read.format("jdbc")\
                .option("driver", "com.mysql.cj.jdbc.Driver")\
                .options(**jdbc_params)\
                .load()\
                .withColumn('ins_date', current_date())

            # add the current date to the above df and then you write it
            txnDF.show()
            txnDF\
                .write.mode("append")\
                .partitionBy('ins_date') \
                .format("parquet") \
                .save(src_loc)

        elif src == 'OL':
            # put the code here to pull rewards data from SFTP server and write it to s3
            ol_txn_df = spark.read\
                .format("com.springml.spark.sftp")\
                .option("host", app_secret["sftp_conf"]["hostname"])\
                .option("port", app_secret["sftp_conf"]["port"])\

Exemplo n.º 29

0

Exibir arquivo

Arquivo: Structured_APIs-Chapter_6_Working_with_Different_Types_of_Data.py Projeto: orekoya/spark-demo

def color_locator(column, color_string):
  return locate(color_string.upper(), column)\
          .cast("boolean")\
          .alias("is_" + color_string)
selectedColumns = [color_locator(df.Description, c) for c in simpleColors]
selectedColumns.append(expr("*")) # has to a be Column type

df.select(*selectedColumns).where(expr("is_white OR is_red"))\
  .select("Description").show(3, False)


# COMMAND ----------

from pyspark.sql.functions import current_date, current_timestamp
dateDF = spark.range(10)\
  .withColumn("today", current_date())\
  .withColumn("now", current_timestamp())
dateDF.createOrReplaceTempView("dateTable")


# COMMAND ----------

from pyspark.sql.functions import date_add, date_sub
dateDF.select(date_sub(col("today"), 5), date_add(col("today"), 5)).show(1)


# COMMAND ----------

from pyspark.sql.functions import datediff, months_between, to_date
dateDF.withColumn("week_ago", date_sub(col("today"), 7))\
  .select(datediff(col("week_ago"), col("today"))).show(1)

Exemplo n.º 30

0

Exibir arquivo

Arquivo: _date.py Projeto: mindis/SparkLearning-1

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, to_date, current_date, \
    current_timestamp, date_add, date_sub, datediff, months_between, to_timestamp, hour

if __name__ == '__main__':
    spark = SparkSession.builder.appName("learning").master(
        "local").getOrCreate()

    spark.range(5).withColumn('date', to_date(lit('2019-01-01'))).show()

    spark.read.jdbc

    spark.range(5)\
         .select(current_date().alias('date'), current_timestamp().alias('timestamp'))\
         .select(date_add(col('date'), 1), date_sub(col('timestamp'), 1)).show()

    spark.range(5).select(to_date(lit('2019-01-01')).alias('date1'),
                          to_date(lit('2019-01-05')).alias('date2'))\
                  .select(datediff(col('date2'), col('date1')),
                          months_between(col('date2'), col('date1'))).show()

    spark.range(5).withColumn('date', to_date(
        lit('2019-XX-XX'))).show()  #No emite excepcion

    spark.range(5).withColumn('date_comp1', to_date(lit('2019-01-01')) > to_date(lit('2019-01-02'))) \
                  .withColumn('date_comp2', to_date(lit('2019-01-01')) > to_timestamp(lit('2019-01-02'))) \
        .withColumn('date_comp3', to_date(lit('2019-01-01')) > "2019-01-02") \
                  .withColumn('date_comp3', to_date(lit('2019-01-01')) > "'2019-01-02'").show()

    spark.range(5).select(current_timestamp().alias("timestamp")).select(
        hour(col('date')))

Exemplo n.º 31

0

Exibir arquivo

from pyspark.sql.functions import initcap, lower, upper, ltrim, rtrim, trim, lpad, rpad
df.select(initcap(col("Description"))).show(5)
df.select(col("Description"), initcap(col("Description")),
          lower(col("Description")), upper(col("Description"))).show(5)

string_with_space = "     hello     "

df.select(ltrim(lit(string_with_space)), rtrim(lit(string_with_space)),
          trim(lit(string_with_space))).show()

#regular expressions

#working with dates, timestamps
from pyspark.sql.functions import current_date, current_timestamp, date_add, date_sub, datediff, months_between, to_date, to_timestamp

dateDF = spark.range(10).withColumn("today", current_date()).withColumn(
    "now", current_timestamp())
dateDF.show()

dateDF.select(
    date_add(col("today"), 5).alias("today+5"),
    date_sub(col("today"), 5).alias("today-5")).show()

#convert string to date, default format is 'YYYY-MM-DD'
spark.range(1).select(
    to_date(lit("2019-02-01")).alias("start_date"),
    to_date(lit("2019-03-06")).alias("end_date")).select(
        datediff(col("start_date"), col("end_date"))).show()

cleanDateDF = spark.range(1).withColumn("date1", current_date())
date_format = 'YYYY-MM-DD'