Example #1
def week_diff(end, start):
    """Compute the number of weeks between two dates.

    :param end: name of the end-date column
    :param start: name of the start-date column
    :return: Column with the week difference
    """
    # date_trunc("week", date) returns the Monday of the week the date falls in
    return datediff(date_trunc("week", col(end)),
                    date_trunc("week", col(start))) / 7
Example #2
    def _avg(self, data, avg_over):
        """Average the data over a time period.

        Args:
            data (obj): data to average.
            avg_over (str): periodicity specification.

        Returns:
            Averaged Spark DataFrame

        """
        features = data.columns
        features.remove("datetime")

        new_df = data.withColumn(avg_over,
                                 F.date_trunc(avg_over, data["datetime"]))
        new_df = new_df \
            .groupBy(avg_over) \
            .agg(*[F.avg(feat) for feat in features]) \
            .orderBy(avg_over)

        for feat in features:
            new_df = new_df.withColumnRenamed("avg({})".format(feat), feat)

        new_df = new_df.withColumnRenamed(avg_over, "datetime")
        return new_df
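A standalone sketch of the same pattern outside the class (assumptions: a SparkSession named spark and import pyspark.sql.functions as F; aliasing the aggregate avoids the withColumnRenamed loop used above):

import pyspark.sql.functions as F

# Hypothetical data: one reading per timestamp.
raw = spark.createDataFrame(
    [("2021-01-01 10:15:00", 1.0), ("2021-01-01 10:45:00", 3.0)],
    ["datetime", "temp"]).withColumn("datetime", F.col("datetime").cast("timestamp"))

hourly = (raw
          .withColumn("hour", F.date_trunc("hour", "datetime"))
          .groupBy("hour")
          .agg(F.avg("temp").alias("temp"))   # alias replaces the rename step
          .orderBy("hour"))
hourly.show()   # single row: 2021-01-01 10:00:00 | 2.0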
Example #3
    def make_set(self, setname, align_dates, feats):
        """Make a new Spark DataFrame by picking the relevant features from the different data.

        Users must specify the align_dates argument which makes all selected features from the
        different data to be time consistent with one another. i.e. make everything daily.

        Args:
            setname (str): Name of the new dataset.
            align_dates (str): Options for time consistency: "days", "weeks", "months"
            feats (dict): Dictionary of list of features and the data it orginates from: {data:[feat1, feat2, feat3]}

        """
        set_dict = {}
        for data, feat_list in feats.items():
            for feat in feat_list:
                set_dict[feat] = self.dicts[data][feat].select(
                    F.date_trunc(
                        align_dates,
                        self.dicts[data][feat].datetime).alias("datetime"),
                    "value")

        self.sets[setname] = self._dict2df(set_dict,
                                           sort=True,
                                           drop_nulls=False)

        # Truncated duplicates can exist in the dataframe, so let's drop those samples.
        self.sets[setname] = self.sets[setname].dropDuplicates(["datetime"])
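A hedged call sketch; the surrounding class, self.dicts and self._dict2df are not shown, and the instance name below is illustrative:

# feats maps a data source name to the list of features to pull from it.
feats = {
    "weather": ["temperature", "humidity"],
    "power": ["load"],
}
builder.make_set("daily_set", align_dates="day", feats=feats)  # note: Spark's date_trunc expects singular units such as "day"
daily = builder.sets["daily_set"]   # one time-aligned Spark DataFrame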
Example #4
def aggregateSalesRevenueDeltaBacked(updatesDF, epochId):

    # Sum up the new incoming keys
    incomingSalesAggregateDF = (updatesDF.withColumn(
        "timestamp", f.date_trunc("minute", "timestamp")).groupBy(
            f.col("timestamp"),
            f.col("item_id")).agg(f.sum("sales").alias("sales")))

    targetTable = DeltaTable.forName(spark, "sw_db.delta_backed_state")
    # We merge the new sales with the already existing sales.
    # We simulate a watermark by only retrieving timestamp records greater than max seen timestamp - 5 minutes
    # Note that it is even better to partition the state by date if you have days' worth of data,
    # so that entire partitions can be skipped when the predicate is pushed down.
    mostRecentTimestamp = targetTable.toDF().select(
        f.max("timestamp").alias("max_timestamp")).head().max_timestamp
    watermarkTime = mostRecentTimestamp - timedelta(
        minutes=5) if mostRecentTimestamp else datetime.min

    (targetTable.alias("target").merge(
        incomingSalesAggregateDF.alias("source"), f"""
       target.item_id = source.item_id AND 
       target.timestamp = source.timestamp AND
       target.timestamp > cast('{watermarkTime}' AS TIMESTAMP) AND 
       source.timestamp > cast('{watermarkTime}' AS TIMESTAMP)
       """).whenMatchedUpdate(
            set={
                "sales": f.col("source.sales") + f.col("target.sales")
            }).whenNotMatchedInsertAll().execute())
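The (DataFrame, epochId) signature matches what Structured Streaming's foreachBatch expects; a hedged sketch of wiring it up (the source table name and checkpoint path are assumptions):

salesStream = spark.readStream.table("sw_db.raw_sales")  # assumed source with timestamp, item_id, sales

(salesStream.writeStream
    .foreachBatch(aggregateSalesRevenueDeltaBacked)
    .outputMode("update")
    .option("checkpointLocation", "/tmp/checkpoints/sales_agg")  # illustrative path
    .start())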
Example #5
    def get_daily_avg_v(self):
        data = self.data
        daily_agg = data.withColumn('date', functions.date_trunc('dd', data['timestamp']))
        daily_avg = daily_agg.groupby('date').avg('Volume BTC')
        result = daily_avg.select("date", "avg(Volume BTC)").orderBy('date')
        result.cache()
        return get_label_value(result.collect())
Example #6
def save_comment_per_month(session, comments, base_name):
    counts_per_month = comments\
        .select(date_trunc("month", comments.created).alias("month"))\
        .groupby("month")\
        .count()
    counts_per_month.write.csv(
        "reddit/%s-%s.csv" % (base_name, session.sparkContext.applicationId)
    )
Example #7
def county_reality_supply():
    # Last week's allocation volume of this product spec per county and customer tier
    try:
        print(f"{str(dt.now())} last week's allocation volume of this product spec per county and customer tier")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        co_cust = get_co_cust(spark).select("cust_id", "sale_center_id",
                                            "cust_seg")

        area = get_area(spark)
        # mapping from com_id to city
        city = area.dropDuplicates(["com_id"]).select("com_id", "city")
        # mapping from sale_center_id to its list of counties
        county = area.groupBy("sale_center_id") \
            .agg(f.collect_list("county").alias("county")) \
            .select("sale_center_id", "county")

        # get last week's actual allocation volume
        # cust_item_spw = spark.sql(
        #     "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
        #     .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
        #     .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
        #     .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7)) \
        #     .withColumn("last_sun", f.date_add(col("last_mon"), 6)) \
        #     .where((col("begin_date") == col("last_mon")) & (col("end_date") == col("last_sun")))\
        #     .join(co_cust,"cust_id")

        cust_item_spw = spark.sql(
            "select com_id,cust_id,item_id,qty_allocco,begin_date,end_date from DB2_DB2INST1_SGP_CUST_ITEM_SPW") \
            .withColumn("begin_date", f.to_date(col("begin_date"), "yyyyMMdd")) \
            .withColumn("end_date", f.to_date(col("end_date"), "yyyyMMdd")) \
            .withColumn("last_mon", f.date_sub(f.date_trunc("week", f.current_date()), 7 * 4)) \
            .withColumn("last_sun", f.date_add(col("last_mon"), 6 + 7 * 3)) \
            .where((col("begin_date") >= col("last_mon")) & (col("end_date") <= col("last_sun")))\
            .join(co_cust,"cust_id")

        # column name for the computed value
        colName = "county_gauge_week_volume"
        result = cust_item_spw.groupBy("com_id","sale_center_id","cust_seg", "item_id") \
                                .agg(f.sum("qty_allocco").alias(colName))

        columns = [
            "com_id", "city", "sale_center_id", "county", "gears", "gauge_id",
            "gauge_name", "city", "gears_data_marker", colName
        ]
        result.withColumn("row", f.concat_ws("_", col("sale_center_id"),col("cust_seg"), col("item_id"))) \
            .withColumn("gears_data_marker", f.lit("4")) \
            .join(plm_item, "item_id") \
            .join(city, "com_id") \
            .join(county,"sale_center_id")\
            .withColumnRenamed("item_id","gauge_id")\
            .withColumnRenamed("item_name","gauge_name")\
            .withColumnRenamed("cust_seg","gears")\
            .foreachPartition(lambda x: write_hbase1(x, columns, hbase))
    except Exception:
        tb.print_exc()
Example #8
def process_log_data(spark, input_data, output_data):
    """Loads data from s3 to spark
       creates user, songplays and time dataframe
       write back the user, songplays and time dataframe to s3 in parquet format"""

    # read log data file
    df = spark.read.json(input_data + 'log_data/2018/11/*.json')
    df.persist()

    # filter by actions for song plays
    df = df.filter(df.page == "NextSong")

    # extract columns for users table
    user_table = df.dropDuplicates(['userId']) \
                   .select(['userId','firstName','lastName','gender','level'])

    # write users table to parquet files
    user_table.write.parquet(output_data + 'udacity/user')

    # create start_time column from original timestamp column
    df = df.withColumn('start_time',
                       (df['ts'] / 1000).cast(types.TimestampType()))

    # read in song data to use for songplays table
    song_df = spark.read.json(input_data + "song_data/A/*/*/*.json")

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, \
                [song_df.artist_name == df.artist, song_df.duration == df.length, song_df.title == df.song],'inner') \
                .select(['start_time','userId','level','song_id','artist_id','sessionId','location','userAgent'])

    # Adding year and month for partitioning the dataframe
    songplays_table = songplays_table.withColumn('month',F.month('start_time')) \
                                     .withColumn('year',F.year('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(
        'year', 'month').parquet(output_data + 'udacity/songplay')

    # create time column list
    time_column_list = ['start_time',
        F.hour('start_time').alias('hour'),\
        F.dayofmonth('start_time').alias('day'),\
        F.date_trunc('week', 'start_time').alias('week'),\
        F.month('start_time').alias('month'),\
        F.year('start_time').alias('year'), \
        F.dayofweek('start_time').alias('weekday')]

    # extract columns to create time table
    time_table = songplays_table.select(*time_column_list)

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year',
                                 'month').parquet(output_data + 'udacity/time')
Example #9
def compile_date_truncate(t, expr, scope, **kwargs):
    op = expr.op()

    try:
        unit = _time_unit_mapping[op.unit]
    except KeyError:
        raise com.UnsupportedOperationError(
            '{!r} unit is not supported in timestamp truncate'.format(op.unit))

    src_column = t.translate(op.arg, scope)
    return F.date_trunc(unit, src_column)
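The _time_unit_mapping dict referenced above is not part of this snippet; a plausible sketch, mapping Ibis truncation units onto the unit strings accepted by Spark's date_trunc (an assumption about the real mapping):

# Assumed mapping from Ibis time units to Spark date_trunc formats.
_time_unit_mapping = {
    'Y': 'year',
    'Q': 'quarter',
    'M': 'month',
    'W': 'week',
    'D': 'day',
    'h': 'hour',
    'm': 'minute',
    's': 'second',
}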
Example #10
def comment_spoilers_per_subreddit(session, spoiler_comments, month=None):
    if month:
        comments_with_month = spoiler_comments\
            .withColumn("month", date_trunc("month", spoiler_comments.created))
        spoiler_comments = comments_with_month\
            .filter(comments_with_month.month == month)
    spoiler_counts_per_subreddit = spoiler_comments.groupby("subreddit")\
        .count()\
        .sort(desc("count"))
    spoiler_counts_per_subreddit.write.csv(
        "reddit/spoilers_per_subreddit-%s.csv" % session.sparkContext.applicationId
    )
    return spoiler_counts_per_subreddit
Example #11
def predict():
    test = spark.read.csv(STREAMED_FILENAME, header=True, mode="DROPMALFORMED")
    test = test.select(F.col("timestamp"), cleanText(F.col("text")))
    messages = test.toPandas()
    times = test.select("timestamp")
    test = test.drop("timestamp")
    toxicTagger = PipelineModel.load(MODEL_PATH)
    predictions = toxicTagger.transform(test).select(F.col("prediction"))
    testIndex = predictions.withColumn("id", F.monotonically_increasing_id())
    timesIndex = times.withColumn("id", F.monotonically_increasing_id())
    tagged = timesIndex.join(testIndex, "id", "inner").drop("id")
    tagged = tagged.withColumn(
        "datetime",
        F.from_unixtime(F.unix_timestamp("timestamp", "yyyy-MM-dd HH:mm:ss")))
    tagged = tagged.select(F.col("datetime"), F.col("prediction"))
    minutes = tagged.withColumn(
        "timestamp", F.date_trunc("minute",
                                  F.col("datetime").cast("timestamp")))
    hours = tagged.withColumn(
        "timestamp", F.date_trunc("hour",
                                  F.col("datetime").cast("timestamp")))
    minutes = minutes.select(F.col("timestamp"), F.col("prediction"))
    hours = hours.select(F.col("timestamp"), F.col("prediction"))
    resultMinutes = minutes.groupBy("timestamp").mean("prediction").sort(
        F.col("timestamp").asc())
    resultHours = hours.groupBy("timestamp").mean("prediction").sort(
        F.col("timestamp").asc())
    resultMinutes = resultMinutes.na.drop(
        subset=["timestamp", "avg(prediction)"])
    resultHours = resultHours.na.drop(subset=["timestamp", "avg(prediction)"])
    resultMinutes, resultHours = resultMinutes.toPandas(
    ), resultHours.toPandas()
    resultMinutes['timestamp'] = pd.to_datetime(resultMinutes.timestamp)
    resultHours['timestamp'] = pd.to_datetime(resultHours.timestamp)
    resultMinutes.columns = ['timestamp', 'prediction']
    resultHours.columns = ['timestamp', 'prediction']
    messages['timestamp'] = pd.to_datetime(messages.timestamp)
    return resultMinutes, resultHours, messages
Example #12
def to_friday(df, dt):
    '''Convert all days to the Friday of their week.

    Note that this is not a forward mapping, as Saturday and Sunday are mapped backward.
    This is fine since all the data are supposed to be on "business days".
    To be consistent over the weekends, use the timestamps instead.'''
    cols = df.columns
    # Convert all days to Friday of the week
    df = df.withColumn('friday', F.date_add(F.date_trunc('week', dt), 4))
    # Keep only the last record in each week
    w = Window.partitionBy('friday').orderBy(F.col(dt).desc())
    df = df.withColumn('rn', F.row_number().over(w)).where(F.col('rn') == 1)
    df = df.drop(dt).withColumnRenamed('friday', dt)
    return df.select(*cols)
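A small worked sketch: date_trunc('week', ...) yields the Monday of the week, so date_add(..., 4) lands on Friday (assumes spark, F and Window are available as in the snippet above):

df = spark.createDataFrame(
    [("2021-01-06", 1.0),    # Wednesday
     ("2021-01-09", 2.0)],   # Saturday, mapped backward to the same Friday
    ["dt", "price"]).withColumn("dt", F.col("dt").cast("date"))

to_friday(df, "dt").show()
# Both rows collapse to Friday 2021-01-08 and only the later record (price 2.0) survives.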
Example #13
def create_vars(df, cells):
    # Loading variables
    df = df.withColumn("call_datetime", to_timestamp("call_datetime","dd/MM/yyyy HH:mm:ss"))
    #get call_date from call_datetime
    df = df.withColumn('call_date', df.call_datetime.cast('date'))
    
    # Recreate analysis variables
    df = df.join(cells, df.location_id == cells.cell_id, how = 'left').drop('cell_id')\
      .orderBy('msisdn', 'call_datetime')\
      .withColumn('region_lag', F.lag('region').over(user_window))\
      .withColumn('region_lead', F.lead('region').over(user_window))\
      .withColumn('call_datetime_lag', F.lag('call_datetime').over(user_window))\
      .withColumn('call_datetime_lead', F.lead('call_datetime').over(user_window))\
      .withColumn('hour_of_day', F.hour('call_datetime').cast('byte'))\
      .withColumn('hour', F.date_trunc('hour', F.col('call_datetime')))\
      .withColumn('week', F.date_trunc('week', F.col('call_datetime')))\
      .withColumn('month', F.date_trunc('month', F.col('call_datetime')))\
      .withColumn('constant', F.lit(1).cast('byte'))\
      .withColumn('day', F.date_trunc('day', F.col('call_datetime')))\
      .na.fill({'region' : missing_value_code ,
                'region_lag' : missing_value_code ,
                'region_lead' : missing_value_code })    

    return df
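user_window and missing_value_code come from elsewhere in the original module; a plausible sketch of what they might look like (an assumption, not the original definitions):

from pyspark.sql.window import Window

# Per-subscriber window ordered by call time, used by the lag/lead columns above.
user_window = Window.partitionBy('msisdn').orderBy('call_datetime')

# Sentinel value used to fill missing regions.
missing_value_code = 99999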
Example #14
    def _avg(self, data, avg_over):
        features = data.columns
        features.remove("datetime")

        new_df = data.withColumn(avg_over,
                                 F.date_trunc(avg_over, data["datetime"]))
        new_df = new_df \
            .groupBy(avg_over) \
            .agg(*[F.avg(feat) for feat in features]) \
            .orderBy(avg_over)

        for feat in features:
            new_df = new_df.withColumnRenamed("avg({})".format(feat), feat)

        new_df = new_df.withColumnRenamed(avg_over, "datetime")
        return new_df
Example #15
def execute(spark: SparkSession, log: logging, config: dict):
    log.info("extract")
    params = config['params']
    ps_conf = config['postgres']

    ts: datetime.datetime = params['ts']
    in_path = ts.strftime(params['in_path'])
    ts_from = config['ts_from']
    ts_to = config['ts_to']
    df = spark.read.csv(in_path, header=True, sep=';')
    # assign the result so the derived TS column is available for the partitioned write below
    df = df.select(
        F.col('FROM_PHONE_NUMBER'), F.col('TO_PHONE_NUMBER'),
        F.to_timestamp(df['START_TIME'], 'dd/MM/yyyy HH:mm:ss').alias('START_TIME'),
        F.col('CALL_DURATION').cast('long'), F.col('IMEI'), F.col('LOCATION')
    ).withColumn("TS", F.date_format(F.date_trunc("hour", "START_TIME"), "yyyy-MM-dd-HH"))
    df.write.partitionBy("TS").mode('append').format('hive').saveAsTable('task_02')
    df = spark.sql("select * from task_02 where TS >= {} AND TS < {}".format(ts_from, ts_to)).drop_duplicates()
    df.cache()
    ts = df.select("TS").rdd.map(lambda x: x[0]).first()
    # Number of call, total call duration.
    num_call = df.count()
    total_call_duration = list(df.select(F.sum(df['CALL_DURATION'])).first().asDict().values())[0]

    # Number of call in working hour (8am to 5pm)
    num_call_working_hour = df.filter("hour(START_TIME) >= 8 AND hour(START_TIME) <= 17").count()

    # Find the IMEI which make most call.
    imei_most = df.groupBy('IMEI').count().sort(F.col("count").desc()).first().asDict()

    # Find top 2 locations which make most call.
    locations = list(map(lambda x: x.asDict(), df.groupBy('LOCATION').count().sort(F.col("count").desc()).head(2)))

    rs = (ts, num_call, total_call_duration, num_call_working_hour, imei_most, locations)
    with get_postgres_cli(ps_conf) as ps_cli:
        with ps_cli.cursor() as cur:
            sql = """
            INSERT INTO metric_hour(
                ts, num_call, total_call_duration, 
                num_call_working_hour, imei_most, locations
            ) VALUES(%s, %s, %s, %s, %s, %s) 
            ON CONFLICT (ts) 
            DO UPDATE SET(
                num_call, total_call_duration, num_call_working_hour, imei_most, locations) = 
                (EXCLUDED.num_call, EXCLUDED.total_call_duration, EXCLUDED.num_call_working_hour,
                 EXCLUDED.imei_most, EXCLUDED.locations)
            """
            cur.execute(sql, rs)
Example #16
    def make_set(self, setname, align_dates, feats):
        #         feats={data:[feat]}
        set_dict = {}
        for data, feat_list in feats.items():
            for feat in feat_list:
                set_dict[feat] = self.dicts[data][feat].select(
                    F.date_trunc(
                        align_dates,
                        self.dicts[data][feat].datetime).alias("datetime"),
                    "value")

        self.sets[setname] = self._dict2df(set_dict,
                                           sort=True,
                                           drop_nulls=False)

        #         Truncated duplicates can exist in the dataframe, so let's drop those samples.
        self.sets[setname] = self.sets[setname].dropDuplicates(["datetime"])
Example #17
def execute(spark: SparkSession, log: logging, config: dict):
    log.info("extract")
    in_path = config['params']['in_path']
    out_path = config['params']['out_path']
    df = spark.read.csv(in_path, header=True).repartition(120, "PHONE_NUMBER").na.fill(
        {'DEACTIVATION_DATE': '9999-12-31'})

    log.info("transform")
    df_norm = df.sort(df.DEACTIVATION_DATE.desc()).groupby(
        ['PHONE_NUMBER']
    ).agg(
        F.collect_list(df['ACTIVATION_DATE']).alias('ACTIVATION_DATE'),
        F.collect_list(df['DEACTIVATION_DATE']).alias('DEACTIVATION_DATE')
    ).withColumn(
        'ACTUAL_ACTIVE_DATE',
        udf_actual_active_date(F.col('ACTIVATION_DATE'), F.col('DEACTIVATION_DATE'))
    ).select(['PHONE_NUMBER', 'ACTUAL_ACTIVE_DATE']).withColumn(
        "TS", F.date_format(F.date_trunc("month", "ACTUAL_ACTIVE_DATE"), "yyyy-MM"))

    log.info("load")
    df_norm.write.partitionBy("TS").parquet(out_path, mode="overwrite")
    spark.read.parquet(out_path)
Example #18
def persist_weather(wthr):
    """ Create and cache dataframe of weather data.

    Round hourly temp to the nearest 10 deg F
    Divide rainfall into none, light (<=0.2 in), and heavy bins
    Truncate timestamp to the hour
    """
    wthr = wthr \
        .select('date', 'tdry', 'precip') \
        .filter(wthr.station == '72534014819') \
        .filter(wthr.report == 'FM-15') \
        .fillna({'precip':0})
    wthr = wthr \
        .withColumn('trnd', sf.round(wthr.tdry/10)*10) \
        .withColumn('prnd', sf.when(wthr.precip == 0, 0) \
                              .when(wthr.precip.between(0,0.2), 0.2) \
                              .otherwise(1)) \
        .withColumn('timernd', sf.date_trunc("Hour", wthr.date)) \
        .withColumn('day', (sf.date_format(wthr.date, 'u')).cast('int')) \
        .withColumn('hour', sf.hour(wthr.date)) \
        .drop('tdry', 'precip', 'date')
    wthr = wthr.cache()
    return wthr
Example #19
def persist_cabs(cabs):
    """Filter, calculate columns, partition on start time and cache cab df."""
    cabs = cabs \
        .filter(cabs.trip_tot < 500) \
        .select(['taxi', 'start_str', 'comm_pick', 'dur',
                 'dist', 'fare', 'tip', 'extra']) \
        .fillna(0, subset=['fare', 'tip', 'extra', 'dur', 'dist'])
    cabs = cabs \
        .withColumn('startrnd', sf.date_trunc("Hour",
            sf.to_timestamp(cabs.start_str, 'MM/dd/yyyy hh:mm:ss aa'))) \
        .withColumn('total', cabs.fare + cabs.tip + cabs.extra) \
        .drop('start_str', 'fare', 'tip', 'extra')
    cabs = cabs \
        .withColumn('permile',
            sf.when(cabs.dist > 0.2, sf.least(cabs.total / cabs.dist, sf.lit(20))) \
              .otherwise(sf.lit(4))) \
        .withColumn('permin',
            sf.when(cabs.dur > 1, sf.least(cabs.total / (cabs.dur/60), sf.lit(5))) \
              .otherwise(sf.lit(1))) \
        .drop('dur', 'dist')

    cabs = cabs.repartition(200, 'startrnd') \
        .persist(StorageLevel.MEMORY_AND_DISK_SER)
    return cabs
Example #20
def comment_spoilers_per_month_and_sub(session, spoiler_comments, limit_top_n=5):
    spoiler_counts = spoiler_comments\
        .select(date_trunc("month", spoiler_comments.created).alias("month"), "subreddit")\
        .groupby("month", "subreddit")\
        .count()\
        .rdd\
        .groupBy(lambda row: row["month"])\
        .cache()
    top_subs = spoiler_counts\
        .mapValues(lambda values: sorted(values, key=lambda row: row["count"], reverse=True))\
        .map(lambda pair: pair[1][:limit_top_n] if limit_top_n is not None else pair[1])\
        .flatMap(lambda rows: [row["subreddit"] for row in rows])\
        .distinct()\
        .collect()
    top_sub_counts = spoiler_counts\
        .map(lambda pair: [pair[0]] + subreddit_counts(top_subs, pair[1]))
    header = session.sparkContext.parallelize([["month"] + top_subs + ["others"]])
    header.map(lambda row: ",".join(row)).saveAsTextFile(
        "reddit/spoilers_per_month_and_sub_header-%s.csv" % session.sparkContext.applicationId
    )
    top_sub_counts.map(lambda row: ",".join(str(el) for el in row)).saveAsTextFile(
        "reddit/spoilers_per_month_and_sub-%s.csv" % session.sparkContext.applicationId
    )
    spoiler_counts.unpersist()
Example #21
    def process_history_df_mom(self, df):
        """
        Process function for history data, generate result dataframe
        that contains week, number of create events
        """
        # There are two versions of API for CreateEvent of repository:
        # - One is        col("payload")['object'] == 'repository'
        # - Another is    col("payload")['ref_type'] == 'repository'
        # try:
        df_columns = df.columns
        df_first_record = df.first()
        # keyword = 'object' if 'object' in df_first_record['payload'] else 'ref_type'

        num_create_events_df = \
            df \
            .filter((col('payload')['ref_type'] == 'repository') | (col('payload')['object'] == 'repository')) \
            .filter((col('type') == 'CreateEvent') | (col('type') == 'Event'))

        # count the number of create events that happened in each month (group by month)
        num_create_events_by_month_df = num_create_events_df.groupby(
            date_trunc('month',
                       df.created_at).alias('month_created_at')).count()

        # calculate the growth rate of each month compared to the previous month:
        # duplicate the dataframe; for each month in the first dataframe,
        # find the number of create events in the second dataframe
        # for the month that is one month earlier
        # [df1] 2015-02-01 -> [df2] 2015-01-01 (1 month)
        num_create_events_by_month_df_1 = num_create_events_by_month_df.alias(
            'num_create_events_by_month_df_1')

        num_create_events_by_month_df_1 = \
            num_create_events_by_month_df_1 \
            .select(
                col('month_created_at').alias('month_created_at_1'),
                col('count').alias('count_1'))

        num_create_events_by_month_df_2 = num_create_events_by_month_df.alias(
            'num_create_events_by_month_df_2')

        num_create_events_by_month_df_2 = \
            num_create_events_by_month_df_2 \
            .select(
                col('month_created_at').alias('month_created_at_2'),
                col('count').alias('count_2'))

        joined_num_create_events_df = \
            num_create_events_by_month_df_1 \
            .withColumn(
                'last_week_month_created_at',
                date_trunc(
                    'month',
                    add_months(num_create_events_by_month_df_1.month_created_at_1, -1))) \
            .join(
                num_create_events_by_month_df_2,
                col('last_week_month_created_at')
                == col('month_created_at_2'),
                how='left_outer')

        joined_num_create_events_df.show()

        joined_num_create_events_df = joined_num_create_events_df.withColumn(
            'count_2', coalesce('count_2', 'count_1'))

        num_create_events_with_growth_rate_df = \
            joined_num_create_events_df \
            .withColumn(
                'monthly_increase_rate',
                ((joined_num_create_events_df.count_1 - joined_num_create_events_df.count_2) / joined_num_create_events_df.count_2)
            ) \
            .select(
                'month_created_at_1',
                'count_1',
                'monthly_increase_rate')

        num_create_events_with_growth_rate_df.show()

        return num_create_events_with_growth_rate_df
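A quick sketch of the join-key logic above: truncating add_months(d, -1) to the month gives the first instant of the previous month (assumes spark and the same unqualified pyspark.sql.functions imports used in this snippet):

spark.createDataFrame([('2015-02-17 08:00:00',)], ['created_at']) \
    .select(
        date_trunc('month', 'created_at').alias('month_created_at'),
        date_trunc('month', add_months('created_at', -1)).alias('prev_month')) \
    .show()
# month_created_at = 2015-02-01 00:00:00, prev_month = 2015-01-01 00:00:00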
Example #22
    def __call__(self, x: sql.Column):
        truncated = functions.date_trunc(self._frequency, x)
        return truncated.astype(self.dtype)
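The enclosing class is not shown in this snippet; a minimal sketch of how such a truncating callable might look and be used (class and attribute names are assumptions):

from pyspark.sql import Column, functions, types


class TruncateToPeriod:
    """Callable that truncates a timestamp column to a fixed frequency."""

    def __init__(self, frequency, dtype=types.TimestampType()):
        self._frequency = frequency   # e.g. 'month', 'week', 'day'
        self.dtype = dtype

    def __call__(self, x: Column):
        truncated = functions.date_trunc(self._frequency, x)
        return truncated.astype(self.dtype)


# Usage sketch:
# df.select(TruncateToPeriod('month')(df['ts']).alias('month_start'))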
Example #23
    F.round(((col("power_out") / 10085) * 100),
            1).alias("Percent Total Time Power Out"),
    F.round(((col("power_out") / col("total_measurements")) * 100),
            1).alias("Percent Monitored Time Power Out"),
    F.round(((col("power_ambiguous") / 10085) * 100),
            1).alias("Percent Total Time Power Ambiguous"),
    F.round(((col("power_ambiguous") / col("total_measurements")) * 100),
            1).alias("Percent Monitored Time Power Ambiguous"))

total_summary.show(20)
total_summary.repartition(1).write.format("com.databricks.spark.csv").mode(
    'overwrite').option("header", "true").save(args.result + '/weekly_summary')

daily_summary = pw_df_resampled.groupBy(
    "tx",
    F.date_trunc('day', "time").alias("date_day")).agg(
        F.avg("grid_voltage").alias("average_measured_voltage"),
        total_measurements_in_period.alias("total_measurements"),
        valid_voltage_condition.alias("valid_voltage"),
        under_voltage_condition.alias("under_voltage"),
        over_voltage_condition.alias("over_voltage"),
        measurements_with_one_sensor.alias("one_sensor"),
        measurements_with_two_sensors.alias("two_sensors"),
        measurements_with_three_sensors.alias("three_sensors"),
        measurements_with_zero_sensors.alias("zero_sensors"),
        power_out.alias("power_out"), power_on.alias("power_on"),
        power_ambiguous.alias("power_ambiguous"))

daily_summary = daily_summary.select(
    "tx", "date_day",
    F.round(((col("total_measurements") / 1440) * 100),
Example #24
#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_time' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.avg("restore_time").alias("restore_times_mean"),*exprs)

pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_stddev", F.explode("restore_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'restore_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("restore_times_stddev").alias("restore_times_stddev"),*exprs)
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times_range", F.array_max("restore_times") - F.array_min("restore_times"))

#Okay now to effectively calculate SAIDI/SAIFI we need to know the sensor population
#join the number of sensors reporting metric above with our outage groupings
#then we can calculate the relative SAIDI/SAIFI contribution of each outage
pw_finalized_outages = pw_finalized_outages.join(pw_distinct_core_id, F.date_trunc("day", F.from_unixtime(pw_finalized_outages["outage_time"])) == F.date_trunc("day", pw_distinct_core_id["window_mid_point"]))

pw_finalized_outages = pw_finalized_outages.select("outage_time","restore_times_mean","cluster_size","sensors_reporting","outage_times","outage_times_range","outage_times_stddev","restore_times","restore_times_range","restore_times_stddev", "location")
pw_finalized_outages = pw_finalized_outages.withColumn("relative_cluster_size",col("cluster_size")/col("sensors_reporting"))

pw_finalized_with_string = pw_finalized_outages.withColumn("outage_times",F.to_json("outage_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn("restore_times",F.to_json("restore_times"))
pw_finalized_with_string = pw_finalized_with_string.withColumn("location",F.to_json("location"))

#okay we should save this
pw_finalized_with_string.repartition(1).write.format("com.databricks.spark.csv").mode('overwrite').option("header", "true").save(args.result + '/full_outage_list')
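The explode-then-F.first trick used above, shown standalone on toy data (assumes spark and F as in the surrounding script): explode one array column, aggregate it, and carry every other column through with F.first since its values are identical within the group.

toy = spark.createDataFrame(
    [(1, [10, 20, 30], "a")], ["outage_time", "restore_times", "location"])

exploded = toy.withColumn("restore_time", F.explode("restore_times"))
exprs = [F.first(c).alias(c) for c in exploded.columns
         if c not in ("restore_time", "outage_time")]
exploded.groupBy("outage_time") \
    .agg(F.avg("restore_time").alias("restore_times_mean"), *exprs) \
    .show()
# restore_times_mean = 20.0; restore_times and location pass through unchanged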




#We need to zero fill for every date and cluster size not already present in the dataset
Example #25
df.select(F.date_format('dt', 'MM/dd/yyyy').alias('date')).show()

# COMMAND ----------

# you have the following dataframe. Truncate to get the first day of the year (alias "year"), as well as a second column which gets the first day of the month (alias "month"):
df = spark.createDataFrame([('1997-02-28 05:02:11', )], ['t'])
# Expected:
# +-------------------+-------------------+
# |               year|              month|
# +-------------------+-------------------+
# |1997-01-01 00:00:00|1997-02-01 00:00:00|
# +-------------------+-------------------+

# Answer
df.select(
    F.date_trunc('year', df.t).alias('year'),
    F.date_trunc('mon', df.t).alias('month')).show()

# COMMAND ----------

# You have this Pandas dataframe with the column "time" containing the dates between start_date and end_date.
# 1. Convert to PySpark
# 2. Create a column "transactiondate" by casting the column "time" to date

import pandas as pd

time = pd.date_range('2018-01-01', '2018-01-03', freq='D')
dfp = pd.DataFrame(columns=['time'])
dfp['time'] = time
# Expected:
# +-------------------+---------------+
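A minimal answer sketch for the exercise above (assuming the spark session and F alias used in these examples):

# Answer (sketch)
df = spark.createDataFrame(dfp)                                     # 1. convert to PySpark
df = df.withColumn('transactiondate', F.col('time').cast('date'))   # 2. cast "time" to date
df.show()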
Example #26
datasource1_null = datasource1.where(col("i3pl_chng_time").isNull())

datasource1_null = datasource1_null.withColumn(
    'i3pl_chng_time',
    F.col('i3pl_chng_time').cast(TimestampType()))

# #datasource1_null.select(['i3pl_chng_time']).show(10)

concatenated_data = datasource1_null.union(datasource1_notnull)

#concatenated_data.select(['i3pl_chng_time','source']).show(5)

concatenated_data = concatenated_data.withColumn(
    'DAY_OF_SHIP_RCV',
    F.lit(F.date_trunc("day",
                       concatenated_data['ACTUAL_RCPT_SHIP_DATE'])).cast(
                           TimestampType()))

#concatenated_data = concatenated_data.withColumn('row_status',F.lit('inserted'))

#concatenated_data.select(['row_status','i3pl_chng_time']).show(5)

#concatenated_data.dtypes

datasource2 = DynamicFrame.fromDF(concatenated_data, glueContext,
                                  "datasource2")

applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=[
        ("document_type", "string", "document_type", "string"),
Example #27
from pyspark.sql import functions
from spark_config import spark
"""
Script para retornar o montante (valor) transacionado pago por ISO / merchant, por mes 
"""
if __name__ == '__main__':

    df1 = spark.read.csv(path="../output/sanitize_transactions/",
                         header=True,
                         inferSchema=True,
                         sep=";")

    df2 = df1.where(df1.status == "paid").groupBy(
        functions.date_trunc("month",
                             df1.created_at).alias("transaction_month"),
        df1.iso_id, df1.merchant_id).agg(
            functions.sum(df1.valor).cast("decimal(15,2)").alias("valor"))

    df3 = df2.select(df2.iso_id, df2.merchant_id, df2.transaction_month,
                     df2.valor).orderBy(df2.iso_id, df2.transaction_month,
                                        df2.valor)

    df3.show(truncate=False)
    df3.printSchema()
Example #28
# now explode the outage lists so that every line is a sensor involved in that outage and regroup by transformer and feeder
# then each outage maps to a number of sensors out under each transformer and feeder
# This gives the relative SAIFI contribution of each transformer in each outage
pw_outages_by_feeder = pw_finalized_outages.select("outage_time", "feeder_id")
pw_outages_by_feeder = pw_outages_by_feeder.withColumn("feeder_id",
                                                       F.explode("feeder_id"))
pw_outages_by_feeder = pw_outages_by_feeder.withColumn("size", F.lit(1))
pw_outages_by_feeder = pw_outages_by_feeder.groupBy(
    "outage_time", "feeder_id").agg(F.sum("size").alias("cluster_size"))
pw_outages_by_feeder = pw_outages_by_feeder.select(
    "outage_time", "cluster_size",
    col("feeder_id").alias("feeder_id_o"))
pw_outages_by_feeder.show()
pw_outages_by_feeder = pw_outages_by_feeder.join(
    pw_distinct_core_id_by_feeder,
    ((F.date_trunc("day", F.from_unixtime(
        pw_outages_by_feeder["outage_time"])) == F.date_trunc(
            "day", pw_distinct_core_id_by_feeder["window_mid_point"])) &
     (pw_outages_by_feeder["feeder_id_o"]
      == pw_distinct_core_id_by_feeder["feeder_id"])))

pw_outages_by_feeder = pw_outages_by_feeder.withColumn(
    "relative_cluster_size",
    col("cluster_size") / col("sensors_reporting"))
pw_outages_by_feeder = pw_outages_by_feeder.select("outage_time",
                                                   "relative_cluster_size",
                                                   "feeder_id")

pw_outages_by_tx = pw_finalized_outages.select("outage_time", "tx")
pw_outages_by_tx = pw_outages_by_tx.withColumn("tx", F.explode("tx"))
pw_outages_by_tx = pw_outages_by_tx.withColumn("size", F.lit(1))
pw_outages_by_tx = pw_outages_by_tx.groupBy("outage_time", "tx").agg(
Example #29
def saveOnCassandra(rdd):
    if not rdd.isEmpty():
        #js = rdd.map(lambda x: json.loads(x))
        #a=rdd.collect()
        #print(a)
        #dict = rdd.collectAsMap()
        #rddString = str(rdd.collect())
        #print(rddString)
        #dict = loads(rddString)
        #print(dict)
        #rdd = rddR.map(.value.toString)

        df = spark.read.json(rdd)
        #df =
        #df = df
        #print(df.select("conn.ts").collect())
        #df.select(to_date(col('conn.ts')).alias('ts').cast("date")).show(10,False)
        #df = df.withColumn('conn.date', df['conn.ts'].cast('date'))

        #df1 = df.columns = ["conn"]
        #df.printSchema()
        #df2 = df.select()
        #print(df.select("conn.id.orig_h").alias("orig_h").collect())
        #df3 = df.groupBy(["conn.proto"]).count().orderBy('count', ascending=False).cache()
        #df.printSchema()
        #df.show()
        #df.printSchema()
        #df = loads(df).decode('utf-8')
        #df['conn'].show()
        #row = Row("conn")
        #df = rdd.map(row).toDF()
        #print(dict)
        #print(df.columns['conn'])

        if 'conn' in df.columns:
            '''
            #Proto
            try:
                df = df.na.drop(subset=["conn.proto"])
                dfProto = df.select(["conn.proto"])
                dfProtoAllFinal = dfProto.groupBy(["proto"]).count().show()
                #Proto count with timestamp
                dfPrTs = df.select(["conn.ts","conn.proto"])
                dfProtoTs = dfPrTs.withColumn("date", from_unixtime(dfPrTs['ts']))
                dfProtoHour = dfProtoTs.select('proto', date_trunc("Hour", "date").alias("hour"))
                dfProtoHourFinal = dfProtoHour.groupBy(["proto","hour"]).count()
                dfProtoDay = dfProtoTs.select('proto', date_trunc("day", "date").alias("day"))
                dfProtoDayFinal = dfProtoDay.groupBy(["proto","day"]).count()
            except:
                print("Erro no df de proto")
                pass
            #Service
            try:
                df = df.na.drop(subset=["conn.service"])
                dfService = df.select(["conn.service"])
                dfServiceTs = df.select(["conn.service", "conn.ts"])
                dfServiceDate = dfServiceTs.withColumn('date', from_unixtime(dfServiceTs['ts']))
                dfServiceHour = dfServiceDate.select("service", date_trunc("Hour", "date").alias("hour"))
                dfServiceHourFinal = dfServiceHour.groupBy(["service","hour"]).count()
                dfServiceDay = dfServiceDate.select('service', date_trunc("day", "date").alias("day"))
                dfServiceDayFinal = dfServiceDay.groupBy(["service","day"]).count()
                dfServiceHourFinal.show()
                dfServiceDayFinal.show()
            except:
                print("Erro no df de service")
                pass

            try:
                #Flow
                dfFlow = df.groupBy(["conn.`id.orig_h`","conn.`id.orig_p`","conn.`id.resp_h`", "conn.`id.resp_p`", "conn.proto"]).count()
                #Flow with timestamp
                dfFlowTs = df.select(["conn.ts","conn.`id.orig_h`","conn.`id.orig_p`","conn.`id.resp_h`", "conn.`id.resp_p`", "conn.proto"])
                dfFlowDate = dfFlowTs.withColumn("date", from_unixtime(dfFlowTs['ts']))
                #Flow per hour
                dfFlowHourWithTsDate = dfFlowDate.withColumn("hour", date_trunc("Hour", "date"))
                dfFlowHour = dfFlowHourWithTsDate.drop("ts","date")
                dfFlowHourFinal = dfFlowHour.groupBy(["hour","`id.orig_h`","`id.orig_p`","`id.resp_h`", "`id.resp_p`", "proto"]).count()
                dfFlowHourFinal.show()
                #Flow per day
                dfFlowDayWithTsDate = dfFlowDate.withColumn("day", date_trunc("day", "date"))
                dfFlowDay = dfFlowDayWithTsDate.drop("ts","date")
                dfFlowDayFinal = dfFlowDay.groupBy(["day","`id.orig_h`","`id.orig_p`","`id.resp_h`", "`id.resp_p`", "proto"]).count()
                dfFlowDayFinal.show()
            except:
                print("Erro no df de Flow")
                pass
            '''

            #IP orig
            try:
                dfIpOrigTs = df.select(["conn.ts", "conn.`id.orig_h`"])
                dfIpOrigTsRenamed = dfIpOrigTs.withColumnRenamed(
                    "id.orig_h", "orig_h")
                dfIpOrigDate = dfIpOrigTsRenamed.withColumn(
                    "date", from_unixtime(dfIpOrigTsRenamed['ts']))
                dfIpOrigHour = dfIpOrigDate.select(
                    'orig_h',
                    date_trunc("Hour", "date").alias("hour"))
                dfIpOrigHourFinal = dfIpOrigHour.groupBy(["orig_h",
                                                          "hour"]).count()
                #dfIpOrigDay = dfIpOrigDate.select('orig_h', date_trunc("day", "date").alias("day"))
                #dfIpOrigDayFinal = dfIpOrigDay.groupBy(["orig_h","day"]).count()
                dfIpOrigHourFinal.show()
                #dfIpOrigDayFinal.show()
            except:
                print("Erro no df de Orig_h")
                pass

            #IP resp
            try:
                dfIpRespTs = df.select(["conn.ts", "conn.`id.resp_h`"])
                dfIpRespTsRenamed = dfIpRespTs.withColumnRenamed(
                    "id.resp_h", "resp_h")
                dfIpRespDate = dfIpRespTsRenamed.withColumn(
                    "date", from_unixtime(dfIpRespTsRenamed['ts']))
                dfIpRespHour = dfIpRespDate.select(
                    'resp_h',
                    date_trunc("Hour", "date").alias("hour"))
                dfIpRespHourFinal = dfIpRespHour.groupBy(["resp_h",
                                                          "hour"]).count()
                #dfIpRespDay = dfIpRespDate.select('orig_h', date_trunc("day", "date").alias("day"))
                #dfIpRespDayFinal = dfIpRespDay.groupBy(["orig_h","day"]).count()
                dfIpRespHourFinal.show()
                #dfIpRespDayFinal.show()
            except:
                print("Erro no df de resp_h")
                pass
Example #30
pw_df = spark.read.jdbc(
            url = "jdbc:postgresql://timescale.ghana.powerwatch.io/powerwatch",
            table = query,
            predicates = predicates,
            properties={"user": args.user, "password": args.password, "driver":"org.postgresql.Driver"})

#if you have multiple saves below this prevents reloading the data every time
pw_df.cache()

#We should mark every row with the number of unique sensors reporting in +-5 days so we know the denominator for SAIDI/SAIFI
pw_distinct_core_id = pw_df.select("time","core_id")
pw_distinct_core_id = pw_distinct_core_id.groupBy(F.window("time", '10 days', '1 day')).agg(F.countDistinct("core_id"),F.array_distinct(F.collect_list("core_id")).alias("core_ids_reporting"))
pw_distinct_core_id = pw_distinct_core_id.withColumn("time", F.from_unixtime((F.unix_timestamp(col("window.start")) + F.unix_timestamp(col("window.end")))/2))
pw_distinct_core_id = pw_distinct_core_id.select(col("count(DISTINCT core_id)").alias("sensors_reporting"), "time","core_ids_reporting")
pw_distinct_core_id = pw_distinct_core_id.withColumn("day",F.date_trunc("day","time"))
pw_distinct_core_id = pw_distinct_core_id.select("day","sensors_reporting","core_ids_reporting")

pw_powered_locations = pw_df.select("time","is_powered","core_id","location_latitude","location_longitude")
pw_powered_locations = pw_powered_locations.withColumn("is_powered",col("is_powered").cast(IntegerType()))
pw_powered_locations = pw_powered_locations.groupBy("core_id",F.window("time",'4 minutes', '1 minute')).agg(F.avg("is_powered").alias("avg_power"),
                                                                                                            F.first("location_latitude").alias("location_latitude"),
                                                                                                            F.first("location_longitude").alias("location_longitude"))

pw_powered_locations = pw_powered_locations.filter(col("avg_power") == 1)
pw_powered_locations = pw_powered_locations.withColumn("time", col("window.start"))
pw_powered_locations = pw_powered_locations.select("time","core_id","location_latitude","location_longitude")
pw_powered_locations = pw_powered_locations.withColumn("loc_struct",F.struct("core_id","location_latitude","location_longitude"))
pw_powered_locations = pw_powered_locations.groupBy("time").agg(F.collect_list("loc_struct").alias("loc_struct"))
pw_powered_locations = pw_powered_locations.select(col("time").alias("minute"),"loc_struct")