Example #1
def extractData(days):
    my_window = Window.partitionBy("vin").orderBy("normaltime")
    my_next_window = Window.partitionBy("vin").orderBy(desc("normaltime"))

    dfSchema = build_schema("conv")
    df = None
    for day in days:
        df_1s = load1sDataPerDay(day, dfSchema)
        df_5s = load5sDataPerDay(day)

        if not (df_1s is None or df_5s is None):
            df_tmp = df_1s.join(df_5s, ["vin", "normaltime"], "inner").withColumn("normaltime", to_timestamp(col("normaltime"), normaltimeFormat))
            df_tmp = df_tmp.withColumn("next_normaltime", F.lag(df_tmp.normaltime).over(my_next_window))
            df_tmp = df_tmp.withColumn("prev_normaltime", F.lag(df_tmp.normaltime).over(my_window))
            df_tmp = df_tmp.withColumn("prev_diff", F.when(F.isnull(df_tmp.normaltime.cast("long") - df_tmp.prev_normaltime.cast("long")), 1000).otherwise(df_tmp.normaltime.cast("long") - df_tmp.prev_normaltime.cast("long")))
            df_tmp = df_tmp.withColumn("next_diff", F.when(F.isnull(df_tmp.next_normaltime.cast("long") - df_tmp.normaltime.cast("long")), 1000).otherwise(df_tmp.next_normaltime.cast("long") - df_tmp.normaltime.cast("long")))
            df_tmp = df_tmp.where("(prev_diff >= 60 or next_diff >= 60) and tel_latitudedeg > 0 and tel_longitudedeg > 0")
            print('{} starting/ending rows ..'.format(df_tmp.count()))
            if df is None:
                df = df_tmp
            else:
                df = df.union(df_tmp)
        print('{} processing ..'.format(day))
        if not (df is None):
            print('{} rows loaded ..'.format(df.count()))
Example #2
    def replaceNull(self, value, subset=None):
        isDate = False
        isTimestamp = False

        try:
            if isinstance(value, str):
                date_obj = datetime.datetime.strptime(
                    value, "%Y-%m-%d")  #YYYY-MM-DD format e.g "2020-10-01"
                isDate = True
        except ValueError:
            isDate = False

        try:
            if isinstance(value, str):
                date_obj = datetime.datetime.strptime(
                    value, "%Y-%m-%dT%H:%M:%S"
                )  #YYYY-MM-DDThh:mm:ss format e.g "2020-10-01T19:50:06"
                isTimestamp = True
        except ValueError:
            isTimestamp = False

        if isDate and subset is not None:
            dateCol = (x for x in self.inputSchema
                       if str(x.dataType) == "DateType" and x.nullable == True
                       and x.name in subset)
            for x in dateCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isDate and subset is None:
            dateCol = (x for x in self.inputSchema
                       if str(x.dataType) == "DateType" and x.nullable == True)
            for x in dateCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isTimestamp and subset is not None:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType"
                     and x.nullable == True and x.name in subset)
            for x in tsCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isTimestamp and subset is None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.nullable == True)
            for x in tsCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        else:
            self.inputDf = self.inputDf.fillna(value, subset)

        return self.inputDf
Example #3
def preprocess(df):
    """Drops null values in customer_id and global_product_id and
       calculates purchase count of each product by customer.
    Args:
       df (DataFrame): raw data from CSV file
    Returns:
       DataFrame with columns customer_id, global_product_id, count
    """
    logger.info('Raw data contains {:,} rows'.format(df.count()))
    df = (df.select(
        col('customer_id').cast('integer'),
        col('global_product_id').cast('integer')).filter(
            ~isnull('customer_id') & ~isnull('global_product_id')))
    df.cache()
    logger.info('Cleaned data contains {:,} rows'.format(df.count()))

    df = df.groupBy('customer_id', 'global_product_id').count()
    logger.info('Customer product purchases contains {:,} rows'.format(
        df.count()))
    logger.info('Customer product purchases contains {:,} customers'.format(
        df.select('customer_id').distinct().count()))
    logger.info('Customer product purchases contains {:,} products'.format(
        df.select('global_product_id').distinct().count()))
    logger.info('Customer total product purchases summary statistics:')
    df.select('count').describe().show()
    return df
Example #4
def add_has_user_page(wmhist, page_history, remember_dict):
    user_pages = page_history.filter(f.col("page_namespace_historical") == 2)

    # Filter on redirect/deleted status before the select below drops those columns.
    user_pages = user_pages.filter((f.col("page_is_redirect") == False)
                                   & (f.col("page_is_deleted") == False))

    user_pages = user_pages.select([f.col("wiki_db").alias("up_wiki_db"),
                                    f.col("page_id").alias("user_page_id"),
                                    f.col("page_title_historical").alias("user_page_title"),
                                    f.col("page_first_edit_timestamp").alias("user_page_first_edit"),
                                    f.col("start_timestamp").alias("user_page_start_timestamp"),
                                    f.col("end_timestamp").alias("user_page_end_timestamp")])

    join_cond = [wmhist.wiki_db == user_pages.up_wiki_db,
                 wmhist.event_user_text_historical == user_pages.user_page_title,
                 wmhist.event_timestamp > user_pages.user_page_first_edit,
                 wmhist.event_timestamp >= user_pages.user_page_start_timestamp,
                 ((wmhist.event_timestamp < user_pages.user_page_end_timestamp) | f.isnull(f.col("user_page_end_timestamp")))]

    

    wmhist = wmhist.join(user_pages, on = join_cond, how="left_outer")

    wmhist = wmhist.withColumn("has_user_page", f.isnull(wmhist.user_page_id) == False)

    return((wmhist, remember_dict))
Example #5
def cleanup_no_activity_rows(df, activity_field='activity_dt'):
    """
    each df that was joined can produce "empty" activity rows for
    clients that didn't have activity in that df's activity
    this can blow up so you have "empty" rows for clients that
    did have activity in one of the dfs. this just cleans it up
    so that (client_id, branch) combos that have zero activity
    only get one row
    """

    df_has_activity = df.filter("{} is not null".format(activity_field))\
                        .select([
                            F.col('client_id').alias('client_id_temp'),
                            F.col('branch').alias('branch_temp')
                                ]).distinct()

    df = df.join(df_has_activity,
                 F.isnull(F.col(activity_field))
                 & (F.col('client_id') == F.col('client_id_temp'))
                 & (F.col('branch') == F.col('branch_temp')),
                 how='left')
    df = df.filter(F.isnull(F.col('client_id_temp')))
    df = df.drop('client_id_temp').drop('branch_temp')

    return df
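The left join plus isnull(client_id_temp) filter above is one way to express an anti-join; assuming F is pyspark.sql.functions as in the example, the same cleanup could also be sketched with Spark's built-in left_anti join:

def cleanup_no_activity_rows_anti(df, activity_field='activity_dt'):
    # (client_id, branch) combos that have at least one real activity row.
    df_has_activity = df.filter(F.col(activity_field).isNotNull()) \
                        .select('client_id', 'branch').distinct()
    # Keep "empty" rows only for combos that never had any activity.
    df_empty = df.filter(F.isnull(F.col(activity_field))) \
                 .join(df_has_activity, on=['client_id', 'branch'], how='left_anti')
    return df.filter(F.col(activity_field).isNotNull()).unionByName(df_empty)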
Example #6
    def _get_telemetry_sanity_check_metrics(self, enrollments, df):
        """Return aggregations that check for problems with a client."""

        # TODO: Once we know what form the metrics library will take,
        # we should move the below metric definitions and documentation
        # into it.

        if dict(df.dtypes).get('experiments') != 'map<string,string>':
            # Not all tables have an experiments map - can't make these checks.
            return []

        return [

            # Check to see whether the client_id is also enrolled in other branches
            # E.g. indicates cloned profiles. Fraction of such users should be
            # small, and similar between branches.
            F.max(
                F.coalesce((df.experiments[self.experiment_slug] !=
                            enrollments.branch).astype('int'),
                           F.lit(0))).alias('has_contradictory_branch'),

            # Check to see whether the client_id was sending data in the conversion
            # window that wasn't tagged as being part of the experiment. Indicates
            # either a client_id clash, or the client unenrolling. Fraction of such
            # users should be small, and similar between branches.
            F.max(
                F.coalesce(
                    (~F.isnull(df.experiments)
                     & F.isnull(
                         df.experiments[self.experiment_slug])).astype('int'),
                    F.lit(0))).alias('has_non_enrolled_data'),
        ]
Example #7
def handle_missing_get_indicator_column(df, input_column, expected_type):
    """Helper function used to get an indicator for all missing values."""
    dcol = df[input_column].cast(expected_type)
    if isinstance(expected_type, StringType):
        indicator = sf.isnull(dcol) | (sf.trim(dcol) == "")
    else:
        indicator = sf.isnull(dcol) | sf.isnan(dcol)
    return indicator
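A possible usage, assuming a DataFrame df with a string column 'city' and the same sf alias for pyspark.sql.functions (both hypothetical): attach the indicator as a column and count the missing values.

from pyspark.sql import functions as sf
from pyspark.sql.types import StringType

# Flag rows whose 'city' is null or blank, then count how many there are.
df = df.withColumn(
    "city_missing",
    handle_missing_get_indicator_column(df, "city", StringType()))
df.agg(sf.sum(sf.col("city_missing").cast("int")).alias("n_missing_city")).show()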
Example #8
def task_1(data_io, review_data, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    overall_column = 'overall'
    # Outputs:
    mean_rating_column = 'meanRating'
    count_rating_column = 'countRating'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    data = review_data.groupBy(F.col(asin_column)).agg(
        F.avg(F.col(overall_column)).alias(mean_rating_column),
        F.count("*").alias(count_rating_column))

    merged = product_data.join(data, on=asin_column, how='left')

    aggregate_func = merged.agg(
        F.count("*"), F.avg(F.col(mean_rating_column)),
        F.variance(F.col(mean_rating_column)),
        F.sum(F.isnull(F.col(mean_rating_column)).astype("int")),
        F.avg(F.col(count_rating_column)),
        F.variance(F.col(count_rating_column)),
        F.sum(F.isnull(F.col(count_rating_column)).astype("int"))).collect()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    # Calculate the values programmatically. Do not change the keys and do not
    # hard-code values in the dict. Your submission will be evaluated with
    # different inputs.
    # Modify the values of the following dictionary accordingly.
    res = {
        'count_total': None,
        'mean_meanRating': None,
        'variance_meanRating': None,
        'numNulls_meanRating': None,
        'mean_countRating': None,
        'variance_countRating': None,
        'numNulls_countRating': None
    }
    # Modify res:

    res['count_total'] = aggregate_func[0]
    res['mean_meanRating'] = aggregate_func[1]
    res['variance_meanRating'] = aggregate_func[2]
    res['numNulls_meanRating'] = aggregate_func[3]
    res['mean_countRating'] = aggregate_func[4]
    res['variance_countRating'] = aggregate_func[5]
    res['numNulls_countRating'] = aggregate_func[6]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_1')
    return res
Example #9
def check(decimal_columns):
    precision = 38
    scale = 10
    if isinstance(decimal_columns, tuple):
        name = decimal_columns[0]
        precision = decimal_columns[1]
        scale = decimal_columns[2]
    else:
        name = decimal_columns
    return ~isnull(col(name)) & isnull(col(name).cast(DecimalType(precision, scale)))
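A possible usage, assuming a DataFrame df with a string column 'amount' and the same isnull/col/DecimalType imports (all hypothetical): the returned Column flags values that are present but do not fit the target DecimalType.

# Mark rows whose 'amount' is non-null but cannot be cast to Decimal(38, 10).
df = df.withColumn("amount_invalid", check(("amount", 38, 10)))
df.filter(col("amount_invalid")).show()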
Example #10
    def preprocess(self, df: DataFrame):
        preprocess_df = df.filter(~F.isnull("se_property")) \
            .withColumn("se_label", F.lower(F.col("se_label"))) \
            .filter(~F.isnull("se_label")) \
            .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
            .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
            .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
            .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
            .withColumn("hour", F.hour("collector_tstamp"))
        return preprocess_df
Example #11
def union_label_feature(labelDF, itemprofileDF):

    DF1 = labelDF.join(
        itemprofileDF,
        labelDF.item2 == itemprofileDF.pid, "left_outer").withColumnRenamed(
            "value",
            "features2").drop("pid").where(F.isnull('features2') == False)
    DF2 = DF1.join(
        itemprofileDF,
        DF1.item1 == itemprofileDF.pid, "left_outer").withColumnRenamed(
            'value',
            'features1').drop("pid").where(F.isnull('features1') == False)
    return DF2
Example #12
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(splits=[ float('-Inf'), 0, 4, 8, 11, 15, 22,
                                        29, float('Inf') ],
                                        inputCol="duration_days",
                        outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn("visitduration_id",
                                   when(isnull(col("arrdate")) |
                                        isnull(col("depdate")), 999)\
                                   .otherwise(col("ddbuckets").cast(IntegerType()))
                                 )
    logger.info("Added duration_id")
    return dur_id_df
Example #13
def test_null(frame, to_check):
    frame = frame.toDF()
    test_result = [
        x for x in to_check
        if frame.select(count(when(isnull(x), x))).collect()[0][0] > 0
    ]
    return len(test_result) > 0, ", ".join(test_result) + " contain null values"
Example #14
def get_nulls(df):
    """
    https://github.com/paaarx
    Returns a DataFrame with null count.

    For every column in DataFrame, analyzes each row to search for null values
    and count how many are found.
    Only columns with null values will be returned.

    Parameters:
        df (DataFrame): The DataFrame to be analyzed.

    Returns:
        DataFrame: DataFrame with columns and null count.
    """

    expression = []

    for column in df.columns:
        expression.append(count(when(isnull(column), column)).alias(column))

    df_with_nulls = df.select(expression)

    column_list = []

    for key, value in df_with_nulls.collect()[0].asDict().items():
        if value > 0:
            column_list.append(key)

    return df_with_nulls.select(column_list)
Example #15
def transformCsvToDbDataFile(csvFile):
    """
    Prepare a suitable format to be saved into Postgres database.
    General rules are
    1. Look for minimum nights >= 5 & maximum nights <= 30
    2. Look amenities such as Wifi, TV, and Internet
    3. Replace the $ signs in price & weekly_price with empty string
    4. Convert minimum nights and maximum nights to Integer
    5. Convert price & weekly_price to Double
    6. When weekly_price is null then set it as 0
    """

    return csvFile.select("id", "listing_url", "amenities", "minimum_nights",
                          "maximum_nights", "price", "weekly_price", "city", "country")\
        .filter(csvFile["amenities"].contains("Internet"))\
        .filter(csvFile["amenities"].contains("Wifi"))\
        .filter(csvFile["amenities"].contains("TV"))\
        .filter(csvFile["price"].contains("$"))\
        .withColumn("id", csvFile["id"].cast(IntegerType()))\
        .withColumn("minimum_nights", csvFile["minimum_nights"].cast(IntegerType()))\
        .withColumn("maximum_nights", csvFile["maximum_nights"].cast(IntegerType()))\
        .withColumn("price", F.regexp_replace("price", "\\$", "").cast(DoubleType()))\
        .withColumn("weekly_price", F.when(F.isnull(csvFile["weekly_price"]), 0.0)
                    .otherwise(F.regexp_replace("weekly_price", "\\$", "").cast(DoubleType())))\
        .where(F.col("minimum_nights") >= 5)\
        .where(F.col("maximum_nights") <= 30)
Example #16
    def _clean_data(self, data, stored_missing_values=None):
        missing_values = {}

        data_handling = self.data_settings.get('data_handling', {})
        features_handling = data_handling.get('features_handling', {})

        # remove features by null percentage
        null_percentage = data_handling.get("feature_remove_by_null_percentage", 0.5)
        null_percentages = data.select(
            [(F.count(F.when(F.isnull(c), c)) / data.count()).alias(c) for c in data.columns]).collect()[0]
        data = data.select([c for c in data.columns if null_percentages[c] < null_percentage])

        # filling missing values by function/value
        if len(features_handling.keys()) > 0:
            missing_values = {
                k: v['fillna'] if not isinstance(v.get('fillna', 'mean'), str) else
                data.agg((eval('F.' + v.get('fillna', 'mean')))(k)).collect()[0][0]
                for (k, v) in features_handling.items()
            }

        # filling default missing features by mean
        default_missing_features = list(set(data.columns).difference(set(list(features_handling.keys()))))
        default_missing_values = data.select([F.mean(c).alias(c) for c in default_missing_features]).collect()[0]
        missing_values.update({c: default_missing_values[c] for c in default_missing_features})
        self.save_metadata('missing_values', missing_values)

        if stored_missing_values is not None:
            data = data.fillna(stored_missing_values)
        else:
            data = data.fillna(missing_values)
        return data
Example #17
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    pos_samples_to_remove = pos_samples.select("date", "hour", "street_id",
                                               lit(1).alias("exists"))
    neg_samples = (neg_samples.join(
        pos_samples_to_remove, ["date", "hour", "street_id"],
        "left_outer").filter(isnull("exists")).drop("exists"))
    return neg_samples
Example #18
def get_most_tornados(station_weather_data):
    w1 = Window.partitionBy("COUNTRY_FULL").orderBy(
        ["COUNTRY_FULL", "YEARMODA"])
    w2 = Window.partitionBy("DIFF").orderBy("COUNTRY_FULL")

    tornado_data = station_weather_data.select(
        ['COUNTRY_FULL', "YEARMODA", "FRSHTT"]).where("FRSHTT == '10011'")

    tornado_data = tornado_data.withColumn(
        "PREV",
        F.lag(tornado_data.YEARMODA).over(w1))

    tornado_data = tornado_data.withColumn(
        "DIFF",
        F.when(F.isnull(tornado_data.YEARMODA - tornado_data.PREV),
               0).otherwise(tornado_data.YEARMODA - tornado_data.PREV))

    tornado_data = tornado_data \
        .withColumn("GRP", F.row_number().over(w1) - F.row_number().over(w2)) \
        .withColumn("STREAK", F.row_number().over(Window.partitionBy("GRP").orderBy(["COUNTRY_FULL", "YEARMODA"])))

    first_row = tornado_data.orderBy(F.desc("STREAK")).take(1)[0]
    country = first_row[0]
    value = first_row[-1]

    return country, value
Example #19
    def remove_features_by_null_threshold(self, data, percentage=0.3):
        """
        Removing data with amount of 'nulls' more then the 'percentage'
        :param data: the DataFrame
        :param percentage: percentage - default 30%
        :return: pandas DataFrame
        """
        null_percentages = data.select([
            (F.count(F.when(F.isnull(c), c)) / data.count()).alias(c)
            for c in data.columns
        ]).collect()[0]

        n_features = len(data.columns)
        data = data.select(
            [c for c in data.columns if null_percentages[c] < percentage])
        new_n_features = len(data.columns)
        if n_features == new_n_features:
            print(
                "Feature count was not changed; no feature exceeded the %0.2f null percentage"
                % percentage)
        else:
            print(
                "%d features were removed, new data shape is (%d,%d)" %
                ((n_features - new_n_features), data.count(), new_n_features))
        return data
Example #20
def Sparkseeds(dict, i, k, hashDF, sc):
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0,
                           len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    df = sqlContext.createDataFrame(schemaWordDF)
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value),
               0).otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist,
                       reDF.POS_GEN)
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50))
                       & ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)

    #pos = function(reDF)

    return reDF
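HashTable.hash_djb2 is not defined in this example; a minimal sketch of the classic djb2 string hash it presumably wraps (the 32-bit mask is an assumption):

def hash_djb2(s):
    # djb2: start from 5381 and fold in each character as hash * 33 + ord(c).
    h = 5381
    for c in s:
        h = ((h << 5) + h) + ord(c)
    return h & 0xFFFFFFFF  # clamp to 32 bits (assumption)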
Example #21
def calculate_average_distance(vehicles_evts_df, op_prd_evts_df):
    """
    calculate average distance in an operating period of all vehicles and
    per vehicle as well.
    :param vehicles_evts_df: vehicle events DF.
    :param op_prd_evts_df: operating periods DF.
    :return: DF with col `distance`
    """
    # so that we could join both data frames.
    vehicles_evts_df = vehicles_evts_df.withColumn('key', F.lit(1))
    op_prd_evts_df = op_prd_evts_df.withColumn('key', F.lit(1))

    df_merge = vehicles_evts_df.join(op_prd_evts_df, on='key',
                                     how='left').drop('key')
    df_merge = df_merge \
        .withColumn('lng', F.toRadians('lng')) \
        .withColumn('lat', F.toRadians('lat'))

    w = Window().partitionBy('op_prd_id', 'vehicle_id').orderBy("at")

    df = df_merge.withColumn(
        'distance',
        calculate_distance('lng', 'lat',
                           F.lag('lng', 1).over(w),
                           F.lag('lat', 1).over(w))).alias('distance')

    df = df.withColumn(
        'distance',
        F.when(F.isnull(df['distance']),
               0).otherwise(df['distance'])).alias('distance')

    return df
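calculate_distance is defined elsewhere; since lng/lat are converted to radians before the call, one plausible reading is a haversine UDF over radian coordinates. A sketch under that assumption (the kilometre unit and the null handling for the first row of each window are also assumptions):

from math import asin, cos, sin, sqrt

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

@F.udf(DoubleType())
def calculate_distance(lng1, lat1, lng2, lat2):
    # Haversine great-circle distance; inputs are assumed to be in radians.
    if lng2 is None or lat2 is None:
        return None  # the first row of each window has no previous point
    earth_radius_km = 6371.0
    dlng, dlat = lng2 - lng1, lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlng / 2) ** 2
    return 2 * earth_radius_km * asin(sqrt(a))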
Example #22
    def __init__(self):
        super(FeatureRequestIntervalVariance, self).__init__()

        self.w = Window.partitionBy(
            F.col('client_request_host'), F.col('client_ip')
        ).orderBy(F.col("@timestamp"))
        self.group_by_aggs = {
            'request_interval_var': F.variance(
                F.col('request_interval').cast('float') / 60.
            ),
        }
        self.pre_group_by_calcs = {
            'row_num_per_group':
                F.row_number().over(self.w),
            'prev_ts': F.lag(F.col('@timestamp')).over(
                self.w),
            'request_interval': F.when(
                F.col('row_num_per_group') > 1,
                F.when(
                    F.isnull(
                        F.col('@timestamp').cast('long') -
                        F.col('prev_ts').cast('long')
                    ), 0
                ).otherwise(
                    F.col('@timestamp').cast('long') -
                    F.col('prev_ts').cast('long')
                )).otherwise(None),
        }
Example #23
def main(
    salt,
    iterations,
    klen,
    project,
    input_table,
    output_table,
    bucket,
):
    spark = (SparkSession.builder.appName("adjust_gps_hash").getOrCreate())

    @udf("string")
    def pbkdf2_sha1hmac(msg, salt, iterations, klen):
        import hashlib
        import base64
        return base64.b64encode(
            hashlib.pbkdf2_hmac('sha1', str.encode(msg), str.encode(salt),
                                iterations, klen)).decode()

    (spark.read.format("bigquery").option("table", f"{project}.{input_table}").
     load().where(~isnull("gps_adid")).withColumn(
         "identifier",
         pbkdf2_sha1hmac(col("gps_adid"), lit(salt),
                         lit(iterations), lit(klen))).select(
                             "identifier",
                             "installed_at").write.format("bigquery").option(
                                 "table", f"{project}.{output_table}").option(
                                     "temporaryGcsBucket",
                                     bucket).mode("overwrite").save())

    spark.stop()
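The UDF body is plain hashlib, so the key derivation can be sanity-checked locally without Spark; a small example with made-up inputs:

import base64
import hashlib

# Same derivation as the pbkdf2_sha1hmac UDF above, run on a single value.
digest = hashlib.pbkdf2_hmac('sha1', b'example-gps-adid', b'example-salt',
                             10000, 32)
print(base64.b64encode(digest).decode())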
Example #24
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    pos_samples_to_remove = pos_samples.select('date', 'hour', 'street_id',
                                               lit(1).alias('exists'))
    neg_samples = (neg_samples.join(
        pos_samples_to_remove, ['date', 'hour', 'street_id'],
        "left_outer").filter(isnull('exists')).drop('exists'))
    return neg_samples
Example #25
    def cleanNullVisitors(self, adsDf):
        totalRecords = adsDf.count()
        adsFilteredVisitors = adsDf.filter(~isnull("visitorId"))
        remainingRecords = adsFilteredVisitors.count()
        print("Total number of records::" + str(totalRecords))
        print("Removed visitors with null visitorId count::" +
              str(totalRecords - remainingRecords))
        return adsFilteredVisitors
Example #26
def l2_to_l3(input_df):
    df = input_df
    df = df.sort(df.LAST_UPDT_DATE_TIME.desc())
    window = Window.partitionBy(df['INV_ID']).orderBy(df['LAST_UPDT_DATE_TIME'].desc())
    df = df.select('*', row_number().over(window).alias('row_number')).filter(col('row_number') == 1)
    delete_is_null = F.isnull(F.col("LOG_DEL_IND"))
    df = df.where(delete_is_null)
    return df
Example #27
    def filter_df(self, df):
        return df.filter(
            ~F.isnull(df.experiments[self.experiment_slug])
        ).filter(
            df.submission_date_s3 >= self.start_date
        ).withColumn(
            'branch', df.experiments[self.experiment_slug]
        )
Example #28
def null_values(df):
    schema = {col: col_type for col, col_type in df.dtypes}
    nulls = [
        json.loads(x) for x in df.select([
            count(when(isnull(c), c)).alias(c) for c, type in schema.items()
            if type != "timestamp"
        ]).toJSON().collect()
    ][0]
    return nulls
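A possible usage, assuming an active SparkSession named spark (hypothetical):

df = spark.createDataFrame(
    [(1, None), (2, "a"), (None, "b")], ["id", "label"])
print(null_values(df))  # e.g. {'id': 1, 'label': 1}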
Example #29
def get_growth(data, key):
	data = sc.parallelize(data.collect()[key][1])
	df = sqlc.createDataFrame(data, ["date", "value"])
	my_window = Window.partitionBy().orderBy("date")

	df = df.withColumn("prev_value", F.lag(df.value).over(my_window))
	df = df.withColumn("diff", F.when(F.isnull(((df.value - df.prev_value)/df.prev_value)*100), 0)
								  .otherwise((df.value - df.prev_value)/df.prev_value)*100)
	return df.rdd.map(lambda x: x.date.encode("utf-8")).collect(), df.rdd.map(lambda x: x.diff).collect()	
Example #30
    def get_missing(self):
        nulls = self.data_frame.select(*(when(isnull(c), 1).otherwise(0).alias(c) for c in self.get_variables_segregated()[0]))
        agrupaded = nulls.groupBy().sum(*self.get_variables_segregated()[0])
        final = agrupaded.select(*(agrupaded['sum(' + c + ')'].alias(c) for c in self.get_variables_segregated()[0])). \
            toPandas().transpose()
        final = final.reset_index()
        return spark_session.createDataFrame(final).\
            withColumnRenamed('index', 'Variavel').\
            withColumnRenamed('0', 'missing')
    def transform(self, df, lowerPerc=None, upperPerc=None):

        # check if trained
        if not self._is_trained:
            raise ValueError('You must call the train method first')

        # lower filtering
        if lowerPerc is not None and 0 < lowerPerc < 100:
            lowerFilter = self.ntiles_[:, int(lowerPerc)][:, 1]
            for i, feature in enumerate(self.columns_):
                df = df.where((df[feature] > lowerFilter[i]) | F.isnull(df[feature]))

        # upper filtering
        if upperPerc is not None and 0 < upperPerc < 100:
            upperFilter = self.ntiles_[:, int(upperPerc)][:, 1]
            for i, feature in enumerate(self.columns_):
                df = df.where((df[feature] < upperFilter[i]) | F.isnull(df[feature]))

        return df
redemptionsDf = redDf.select(
    redDf["EnqueuedDateTimeUTC"],
    redDf["jsonBody.redemption.rewardsRedemptionId"],
    redDf["jsonBody.redemption.rewardsTransactionId"],
    redDf["jsonBody.redemption.correlationId"],
    redDf["jsonBody.redemption.eventId"],
    redDf["jsonBody.redemption.programName"],
    redDf["jsonBody.redemption.createdDate"],
    redDf["jsonBody.redemption.pointsRedeemed"],
    redDf["jsonBody.redemption.accrualCreatedDate"],
    redDf["jsonBody.redemption.accrualRewardsActivityName"],
    redDf["jsonBody.redemption.accrualBrandCode"],
    redDf["jsonBody.redemption.accrualChannelSource"],
    redDf["jsonBody.redemption.modifiedDate"],
    redDf["jsonBody.redemption.accrualStorenumber"],
    redDf["jsonBody.redemption.accrualMerchantid"],
    redDf["jsonBody.redemption.typeCode"])

memberBalancesDf = memBalDf.select(
    memBalDf["EnqueuedDateTimeUTC"],
    memBalDf["jsonBody.membershipPointBalance.membershipId"],
    memBalDf["jsonBody.membershipPointBalance.memberId"],
    memBalDf["jsonBody.membershipPointBalance.programName"],
    memBalDf["jsonBody.membershipPointBalance.currentLevelPoints"],
    memBalDf["jsonBody.membershipPointBalance.modifiedDate"],
    memBalDf["jsonBody.membershipPointBalance.eventId"])

# COMMAND ----------

# We only need accrual records of type 'Reward' with pointsUsed = 0. Additionally, an accrual record gets sent when a member changes status from Chase to non-Chase or vice versa.
accrualsDf = accrualsDf.filter("pointsUsed = '0'")
accrualsDf = accrualsDf.filter("pointType = 'Reward'")
accrualsDf = accrualsDf.filter("rewardsTransactionSubType <> 'Point Expiration Updated'")

# COMMAND ----------

# derive tender_type: rows without a card number are assumed to be Non-SVC, the rest SVC
from pyspark.sql import functions as F
transactionsDf = transactionsDf.withColumn("tender_type", F.when(F.isnull(transactionsDf['cardNumber']), "Non-SVC").otherwise("SVC"))

# COMMAND ----------

# check whether the pointsAccrued and pointsRedeemed columns hold valid numbers and keep only rows with valid numeric values
from pyspark.sql import functions as F
accrualsDf = accrualsDf.withColumn("pointsAccruedNumber", accrualsDf.pointsAccrued.cast('double'))
accrualsNumberErrorDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNull())
accrualsDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNotNull())
redemptionsDf = redemptionsDf.withColumn("pointsRedeemedNumber", redemptionsDf.pointsRedeemed.cast('double'))
memberBalancesDf = memberBalancesDf.withColumn("currentLevelPointsNumber", memberBalancesDf.currentLevelPoints.cast('double'))
redemptionsNumberErrorDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNull())
redemptionsDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNotNull())

# COMMAND ----------