Example #1
def extractData(days):
    my_window = Window.partitionBy("vin").orderBy("normaltime")
    my_next_window = Window.partitionBy("vin").orderBy(desc("normaltime"))

    dfSchema = build_schema("conv")
    df = None
    for day in days:
        df_1s = load1sDataPerDay(day, dfSchema)
        df_5s = load5sDataPerDay(day)

        if not (df_1s is None or df_5s is None):
            df_tmp = df_1s.join(df_5s, ["vin", "normaltime"], "inner").withColumn("normaltime", to_timestamp(col("normaltime"), normaltimeFormat))
            df_tmp = df_tmp.withColumn("next_normaltime", F.lag(df_tmp.normaltime).over(my_next_window))
            df_tmp = df_tmp.withColumn("prev_normaltime", F.lag(df_tmp.normaltime).over(my_window))
            df_tmp = df_tmp.withColumn("prev_diff", F.when(F.isnull(df_tmp.normaltime.cast("long") - df_tmp.prev_normaltime.cast("long")), 1000).otherwise(df_tmp.normaltime.cast("long") - df_tmp.prev_normaltime.cast("long")))
            df_tmp = df_tmp.withColumn("next_diff", F.when(F.isnull(df_tmp.next_normaltime.cast("long") - df_tmp.normaltime.cast("long")), 1000).otherwise(df_tmp.next_normaltime.cast("long") - df_tmp.normaltime.cast("long")))
            df_tmp = df_tmp.where("(prev_diff >= 60 or next_diff >= 60) and tel_latitudedeg > 0 and tel_longitudedeg > 0")
            print('{} starting/ending rows ..'.format(df_tmp.count()))
            if df is None:
                df = df_tmp
            else:
                df = df.union(df_tmp)
        print('{} processing ..'.format(day))
        if not (df is None):
            print('{} rows loaded ..'.format(df.count()))
Example #2
    def replaceNull(self, value, subset=None):
        isDate = False
        isTimestamp = False

        try:
            if isinstance(value, str):
                date_obj = datetime.datetime.strptime(
                    value, "%Y-%m-%d")  #YYYY-MM-DD format e.g "2020-10-01"
                isDate = True
        except ValueError:
            isDate = False

        try:
            if isinstance(value, str):
                date_obj = datetime.datetime.strptime(
                    value, "%Y-%m-%dT%H:%M:%S"
                )  #YYYY-MM-DDThh:mm:ss format e.g "2020-10-01T19:50:06"
                isTimestamp = True
        except ValueError:
            isTimestamp = False

        if isDate and subset is not None:
            dateCol = (x for x in self.inputSchema
                       if str(x.dataType) == "DateType" and x.nullable == True
                       and x.name in subset)
            for x in dateCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isDate and subset is None:
            dateCol = (x for x in self.inputSchema
                       if str(x.dataType) == "DateType" and x.nullable == True)
            for x in dateCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isTimestamp and subset is not None:
            tsCol = (x for x in self.inputSchema
                     if str(x.dataType) == "TimestampType"
                     and x.nullable == True and x.name in subset)
            for x in tsCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        elif isTimestamp and subset is None:
            tsCol = (
                x for x in self.inputSchema
                if str(x.dataType) == "TimestampType" and x.nullable == True)
            for x in tsCol:
                self.inputDf = self.inputDf.withColumn(
                    x.name,
                    when(isnull(col(x.name)),
                         lit(value)).otherwise(col(x.name)))
        else:
            self.inputDf = self.inputDf.fillna(value, subset)

        return self.inputDf
Example #3
def preprocess(df):
    """Drops null values in customer_id and global_product_id and
       calculates purchase count of each product by customer.
    Args:
       df (DataFrame): raw data from CSV file
    Returns:
       DataFrame with columns customer_id, global_product_id, count
    """
    logger.info('Raw data contains {:,} rows'.format(df.count()))
    df = (df.select(
        col('customer_id').cast('integer'),
        col('global_product_id').cast('integer')).filter(
            ~isnull('customer_id') & ~isnull('global_product_id')))
    df.cache()
    logger.info('Cleaned data contains {:,} rows'.format(df.count()))

    df = df.groupBy('customer_id', 'global_product_id').count()
    logger.info('Customer product purchases contains {:,} rows'.format(
        df.count()))
    logger.info('Customer product purchases contains {:,} customers'.format(
        df.select('customer_id').distinct().count()))
    logger.info('Customer product purchases contains {:,} products'.format(
        df.select('global_product_id').distinct().count()))
    logger.info('Customer total product purchases summary statistics:')
    df.select('count').describe().show()
    return df
Example #4
def add_has_user_page(wmhist, page_history, remember_dict):
    user_pages = page_history.filter(f.col("page_namespace_historical") == 2)

    # Filter on redirect/deleted status before the select below drops those columns.
    user_pages = user_pages.filter((f.col("page_is_redirect") == False)
                                   & (f.col("page_is_deleted") == False))

    user_pages = user_pages.select([f.col("wiki_db").alias("up_wiki_db"),
                                    f.col("page_id").alias("user_page_id"),
                                    f.col("page_title_historical").alias("user_page_title"),
                                    f.col("page_first_edit_timestamp").alias("user_page_first_edit"),
                                    f.col("start_timestamp").alias("user_page_start_timestamp"),
                                    f.col("end_timestamp").alias("user_page_end_timestamp")])

    join_cond = [wmhist.wiki_db == user_pages.up_wiki_db,
                 wmhist.event_user_text_historical == user_pages.user_page_title,
                 wmhist.event_timestamp > user_pages.user_page_first_edit,
                 wmhist.event_timestamp >= user_pages.user_page_start_timestamp,
                 ((wmhist.event_timestamp < user_pages.user_page_end_timestamp) | f.isnull(f.col("user_page_end_timestamp")))]

    

    wmhist = wmhist.join(user_pages, on = join_cond, how="left_outer")

    wmhist = wmhist.withColumn("has_user_page", f.isnull(wmhist.user_page_id) == False)

    return((wmhist, remember_dict))
Example #5
def cleanup_no_activity_rows(df, activity_field='activity_dt'):
    """
    each df that was joined can produce "empty" activity rows for
    clients that didn't have activity in that df's activity
    this can blow up so you have "empty" rows for clients that
    did have activity in one of the dfs. this just cleans it up
    so that (client_id, branch) combos that have zero activity
    only get one row
    """

    df_has_activity = df.filter("{} is not null".format(activity_field))\
                        .select([
                            F.col('client_id').alias('client_id_temp'),
                            F.col('branch').alias('branch_temp')
                                ]).distinct()

    df = df.join(df_has_activity,
                 F.isnull(F.col(activity_field))
                 & (F.col('client_id') == F.col('client_id_temp'))
                 & (F.col('branch') == F.col('branch_temp')),
                 how='left')
    df = df.filter(F.isnull(F.col('client_id_temp')))
    df = df.drop('client_id_temp').drop('branch_temp')

    return df
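The left join plus isnull(client_id_temp) filter above is one way to express an anti-join; assuming F is pyspark.sql.functions as in the example, the same cleanup could also be sketched with Spark's built-in left_anti join:

def cleanup_no_activity_rows_anti(df, activity_field='activity_dt'):
    # (client_id, branch) combos that have at least one real activity row.
    df_has_activity = df.filter(F.col(activity_field).isNotNull()) \
                        .select('client_id', 'branch').distinct()
    # Keep "empty" rows only for combos that never had any activity.
    df_empty = df.filter(F.isnull(F.col(activity_field))) \
                 .join(df_has_activity, on=['client_id', 'branch'], how='left_anti')
    return df.filter(F.col(activity_field).isNotNull()).unionByName(df_empty)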
Example #6
    def _get_telemetry_sanity_check_metrics(self, enrollments, df):
        """Return aggregations that check for problems with a client."""

        # TODO: Once we know what form the metrics library will take,
        # we should move the below metric definitions and documentation
        # into it.

        if dict(df.dtypes).get('experiments') != 'map<string,string>':
            # Not all tables have an experiments map - can't make these checks.
            return []

        return [

            # Check to see whether the client_id is also enrolled in other branches
            # E.g. indicates cloned profiles. Fraction of such users should be
            # small, and similar between branches.
            F.max(
                F.coalesce((df.experiments[self.experiment_slug] !=
                            enrollments.branch).astype('int'),
                           F.lit(0))).alias('has_contradictory_branch'),

            # Check to see whether the client_id was sending data in the conversion
            # window that wasn't tagged as being part of the experiment. Indicates
            # either a client_id clash, or the client unenrolling. Fraction of such
            # users should be small, and similar between branches.
            F.max(
                F.coalesce(
                    (~F.isnull(df.experiments)
                     & F.isnull(
                         df.experiments[self.experiment_slug])).astype('int'),
                    F.lit(0))).alias('has_non_enrolled_data'),
        ]
Example #7
def handle_missing_get_indicator_column(df, input_column, expected_type):
    """Helper function used to get an indicator for all missing values."""
    dcol = df[input_column].cast(expected_type)
    if isinstance(expected_type, StringType):
        indicator = sf.isnull(dcol) | (sf.trim(dcol) == "")
    else:
        indicator = sf.isnull(dcol) | sf.isnan(dcol)
    return indicator
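A possible usage, assuming a DataFrame df with a string column 'city' and the same sf alias for pyspark.sql.functions (both hypothetical): attach the indicator as a column and count the missing values.

from pyspark.sql import functions as sf
from pyspark.sql.types import StringType

# Flag rows whose 'city' is null or blank, then count how many there are.
df = df.withColumn(
    "city_missing",
    handle_missing_get_indicator_column(df, "city", StringType()))
df.agg(sf.sum(sf.col("city_missing").cast("int")).alias("n_missing_city")).show()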
Example #8
def task_1(data_io, review_data, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    overall_column = 'overall'
    # Outputs:
    mean_rating_column = 'meanRating'
    count_rating_column = 'countRating'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    data = review_data.groupBy(F.col(asin_column)).agg(
        F.avg(F.col(overall_column)).alias(mean_rating_column),
        F.count("*").alias(count_rating_column))

    merged = product_data.join(data, on=asin_column, how='left')

    aggregate_func = merged.agg(
        F.count("*"), F.avg(F.col(mean_rating_column)),
        F.variance(F.col(mean_rating_column)),
        F.sum(F.isnull(F.col(mean_rating_column)).astype("int")),
        F.avg(F.col(count_rating_column)),
        F.variance(F.col(count_rating_column)),
        F.sum(F.isnull(F.col(count_rating_column)).astype("int"))).collect()[0]

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    # Calculate the values programmatically. Do not change the keys and do not
    # hard-code values in the dict. Your submission will be evaluated with
    # different inputs.
    # Modify the values of the following dictionary accordingly.
    res = {
        'count_total': None,
        'mean_meanRating': None,
        'variance_meanRating': None,
        'numNulls_meanRating': None,
        'mean_countRating': None,
        'variance_countRating': None,
        'numNulls_countRating': None
    }
    # Modify res:

    res['count_total'] = aggregate_func[0]
    res['mean_meanRating'] = aggregate_func[1]
    res['variance_meanRating'] = aggregate_func[2]
    res['numNulls_meanRating'] = aggregate_func[3]
    res['mean_countRating'] = aggregate_func[4]
    res['variance_countRating'] = aggregate_func[5]
    res['numNulls_countRating'] = aggregate_func[6]

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_1')
    return res
Example #9
def check(decimal_columns):
    precision = 38
    scale = 10
    if isinstance(decimal_columns, tuple):
        name = decimal_columns[0]
        precision = decimal_columns[1]
        scale = decimal_columns[2]
    else:
        name = decimal_columns
    return ~isnull(col(name)) & isnull(col(name).cast(DecimalType(precision, scale)))
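A possible usage, assuming a DataFrame df with a string column 'amount' and the same isnull/col/DecimalType imports (all hypothetical): the returned Column flags values that are present but do not fit the target DecimalType.

# Mark rows whose 'amount' is non-null but cannot be cast to Decimal(38, 10).
df = df.withColumn("amount_invalid", check(("amount", 38, 10)))
df.filter(col("amount_invalid")).show()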
Example #10
    def preprocess(self, df: DataFrame):
        preprocess_df = df.filter(~F.isnull("se_property")) \
            .withColumn("se_label", F.lower(F.col("se_label"))) \
            .filter(~F.isnull("se_label")) \
            .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
            .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
            .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
            .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
            .withColumn("hour", F.hour("collector_tstamp"))
        return preprocess_df
Example #11
def union_label_feature(labelDF, itemprofileDF):

    DF1 = labelDF.join(
        itemprofileDF,
        labelDF.item2 == itemprofileDF.pid, "left_outer").withColumnRenamed(
            "value",
            "features2").drop("pid").where(F.isnull('features2') == False)
    DF2 = DF1.join(
        itemprofileDF,
        DF1.item1 == itemprofileDF.pid, "left_outer").withColumnRenamed(
            'value',
            'features1').drop("pid").where(F.isnull('features1') == False)
    return DF2
Example #12
def add_duration_id(spark, df, logger):
    """Calculate the visitduration_id by splitting the visit duration into buckets"""
    durdays_df = df.withColumn("duration_days", datediff("depdate", "arrdate"))
    ddbucketizer = Bucketizer(splits=[ float('-Inf'), 0, 4, 8, 11, 15, 22,
                                        29, float('Inf') ],
                                        inputCol="duration_days",
                        outputCol="ddbuckets")
    ddbuck_df = ddbucketizer.setHandleInvalid("keep").transform(durdays_df)
    dur_id_df = ddbuck_df.withColumn("visitduration_id",
                                   when(isnull(col("arrdate")) |
                                        isnull(col("depdate")), 999)\
                                   .otherwise(col("ddbuckets").cast(IntegerType()))
                                 )
    logger.info("Added duration_id")
    return dur_id_df
Example #13
def test_null(frame, to_check):
    frame = frame.toDF()
    test_result = [
        x for x in to_check
        if frame.select(count(when(isnull(x), x))).collect()[0][0] > 0
    ]
    return len(test_result) > 0, ", ".join(test_result) + " contain null values"
Example #14
def get_nulls(df):
    """
    https://github.com/paaarx
    Returns a DataFrame with null count.

    For every column in DataFrame, analyzes each row to search for null values
    and count how many are found.
    Only columns with null values will be returned.

    Parameters:
        df (DataFrame): The DataFrame to be analyzed.

    Returns:
        DataFrame: DataFrame with columns and null count.
    """

    expression = []

    for column in df.columns:
        expression.append(count(when(isnull(column), column)).alias(column))

    df_with_nulls = df.select(expression)

    column_list = []

    for key, value in df_with_nulls.collect()[0].asDict().items():
        if value > 0:
            column_list.append(key)

    return df_with_nulls.select(column_list)
Example #15
def transformCsvToDbDataFile(csvFile):
    """
    Prepare a suitable format to be saved into Postgres database.
    General rules are
    1. Look for minimum nights >= 5 & maximum nights <= 30
    2. Look amenities such as Wifi, TV, and Internet
    3. Replace the $ signs in price & weekly_price with empty string
    4. Convert minimum nights and maximum nights to Integer
    5. Convert price & weekly_price to Double
    6. When weekly_price is null then set it as 0
    """

    return csvFile.select("id", "listing_url", "amenities", "minimum_nights",
                          "maximum_nights", "price", "weekly_price", "city", "country")\
        .filter(csvFile["amenities"].contains("Internet"))\
        .filter(csvFile["amenities"].contains("Wifi"))\
        .filter(csvFile["amenities"].contains("TV"))\
        .filter(csvFile["price"].contains("$"))\
        .withColumn("id", csvFile["id"].cast(IntegerType()))\
        .withColumn("minimum_nights", csvFile["minimum_nights"].cast(IntegerType()))\
        .withColumn("maximum_nights", csvFile["maximum_nights"].cast(IntegerType()))\
        .withColumn("price", F.regexp_replace("price", "\\$", "").cast(DoubleType()))\
        .withColumn("weekly_price", F.when(F.isnull(csvFile["weekly_price"]), 0.0)
                    .otherwise(F.regexp_replace("weekly_price", "\\$", "").cast(DoubleType())))\
        .where(F.col("minimum_nights") >= 5)\
        .where(F.col("maximum_nights") <= 30)
Example #16
    def _clean_data(self, data, stored_missing_values=None):
        missing_values = {}

        data_handling = self.data_settings.get('data_handling', {})
        features_handling = data_handling.get('features_handling', {})

        # remove features by null percentage
        null_percentage = data_handling.get("feature_remove_by_null_percentage", 0.5)
        null_percentages = data.select(
            [(F.count(F.when(F.isnull(c), c)) / data.count()).alias(c) for c in data.columns]).collect()[0]
        data = data.select([c for c in data.columns if null_percentages[c] < null_percentage])

        # filling missing values by function/value
        if len(features_handling.keys()) > 0:
            missing_values = {
                k: v['fillna'] if not isinstance(v.get('fillna', 'mean'), str) else
                data.agg((eval('F.' + v.get('fillna', 'mean')))(k)).collect()[0][0]
                for (k, v) in features_handling.items()
            }

        # filling default missing features by mean
        default_missing_features = list(set(data.columns).difference(set(list(features_handling.keys()))))
        default_missing_values = data.select([F.mean(c).alias(c) for c in default_missing_features]).collect()[0]
        missing_values.update({c: default_missing_values[c] for c in default_missing_features})
        self.save_metadata('missing_values', missing_values)

        if stored_missing_values is not None:
            data = data.fillna(stored_missing_values)
        else:
            data = data.fillna(missing_values)
        return data
Example #17
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    pos_samples_to_remove = pos_samples.select("date", "hour", "street_id",
                                               lit(1).alias("exists"))
    neg_samples = (neg_samples.join(
        pos_samples_to_remove, ["date", "hour", "street_id"],
        "left_outer").filter(isnull("exists")).drop("exists"))
    return neg_samples
Example #18
def get_most_tornados(station_weather_data):
    w1 = Window.partitionBy("COUNTRY_FULL").orderBy(
        ["COUNTRY_FULL", "YEARMODA"])
    w2 = Window.partitionBy("DIFF").orderBy("COUNTRY_FULL")

    tornado_data = station_weather_data.select(
        ['COUNTRY_FULL', "YEARMODA", "FRSHTT"]).where("FRSHTT == '10011'")

    tornado_data = tornado_data.withColumn(
        "PREV",
        F.lag(tornado_data.YEARMODA).over(w1))

    tornado_data = tornado_data.withColumn(
        "DIFF",
        F.when(F.isnull(tornado_data.YEARMODA - tornado_data.PREV),
               0).otherwise(tornado_data.YEARMODA - tornado_data.PREV))

    tornado_data = tornado_data \
        .withColumn("GRP", F.row_number().over(w1) - F.row_number().over(w2)) \
        .withColumn("STREAK", F.row_number().over(Window.partitionBy("GRP").orderBy(["COUNTRY_FULL", "YEARMODA"])))

    first_row = tornado_data.orderBy(F.desc("STREAK")).take(1)[0]
    country = first_row[0]
    value = first_row[-1]

    return country, value
Example #19
    def remove_features_by_null_threshold(self, data, percentage=0.3):
        """
        Removing data with amount of 'nulls' more then the 'percentage'
        :param data: the DataFrame
        :param percentage: percentage - default 30%
        :return: pandas DataFrame
        """
        null_percentages = data.select([
            (F.count(F.when(F.isnull(c), c)) / data.count()).alias(c)
            for c in data.columns
        ]).collect()[0]

        n_features = len(data.columns)
        data = data.select(
            [c for c in data.columns if null_percentages[c] < percentage])
        new_n_features = len(data.columns)
        if n_features == new_n_features:
            print(
                "Feature count was not changed; no feature exceeded the %0.2f null percentage"
                % percentage)
        else:
            print(
                "%d features were removed, new data shape is (%d,%d)" %
                ((n_features - new_n_features), data.count(), new_n_features))
        return data
Example #20
def Sparkseeds(dict, i, k, hashDF, sc):
    word = [(i, HashTable.hash_djb2(dict[i][j:j + k]), j)
            for j in range(0,
                           len(dict[i]) - k)]
    rddW = sc.parallelize(word)
    schemaWordDF = rddW.map(
        lambda x: Row(NUM_SEQ=x[0], ID_SEQ=x[1], POS_SEQ=x[2]))
    df = sqlContext.createDataFrame(schemaWordDF)
    reDF = df.join(hashDF, df.ID_SEQ == hashDF.ID_GEN, how='inner')
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)
    my_window = Window.partitionBy(reDF.NUM_SEQ).orderBy(reDF.POS_SEQ)
    reDF = reDF.withColumn("prev_value", F.lag(reDF.POS_SEQ).over(my_window))
    reDF = reDF.withColumn(
        "dist",
        F.when(F.isnull(reDF.POS_SEQ - reDF.prev_value),
               0).otherwise(reDF.POS_SEQ - reDF.prev_value))
    reDF = reDF.select(reDF.NUM_SEQ, reDF.ID_SEQ, reDF.POS_SEQ, reDF.dist,
                       reDF.POS_GEN)
    reDF = reDF.withColumn("dist0", F.lead(reDF.dist).over(my_window))
    elDF = reDF.filter(((reDF.dist == 0) | (reDF.dist >= 50))
                       & ((reDF.dist0.isNull()) | (reDF.dist0 >= 50)))
    reDF = reDF.subtract(elDF)
    reDF = reDF.orderBy(reDF.POS_SEQ).select(reDF.NUM_SEQ, reDF.ID_SEQ,
                                             reDF.POS_SEQ, reDF.POS_GEN)

    #pos = function(reDF)

    return reDF
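HashTable.hash_djb2 is not defined in this example; a minimal sketch of the classic djb2 string hash it presumably wraps (the 32-bit mask is an assumption):

def hash_djb2(s):
    # djb2: start from 5381 and fold in each character as hash * 33 + ord(c).
    h = 5381
    for c in s:
        h = ((h << 5) + h) + ord(c)
    return h & 0xFFFFFFFF  # clamp to 32 bits (assumption)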
Example #21
def calculate_average_distance(vehicles_evts_df, op_prd_evts_df):
    """
    calculate average distance in an operating period of all vehicles and
    per vehicle as well.
    :param vehicles_evts_df: vehicle events DF.
    :param op_prd_evts_df: operating periods DF.
    :return: DF with col `distance`
    """
    # so that we could join both data frames.
    vehicles_evts_df = vehicles_evts_df.withColumn('key', F.lit(1))
    op_prd_evts_df = op_prd_evts_df.withColumn('key', F.lit(1))

    df_merge = vehicles_evts_df.join(op_prd_evts_df, on='key',
                                     how='left').drop('key')
    df_merge = df_merge \
        .withColumn('lng', F.toRadians('lng')) \
        .withColumn('lat', F.toRadians('lat'))

    w = Window().partitionBy('op_prd_id', 'vehicle_id').orderBy("at")

    df = df_merge.withColumn(
        'distance',
        calculate_distance('lng', 'lat',
                           F.lag('lng', 1).over(w),
                           F.lag('lat', 1).over(w))).alias('distance')

    df = df.withColumn(
        'distance',
        F.when(F.isnull(df['distance']),
               0).otherwise(df['distance'])).alias('distance')

    return df
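calculate_distance is defined elsewhere; since lng/lat are converted to radians before the call, one plausible reading is a haversine UDF over radian coordinates. A sketch under that assumption (the kilometre unit and the null handling for the first row of each window are also assumptions):

from math import asin, cos, sin, sqrt

from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

@F.udf(DoubleType())
def calculate_distance(lng1, lat1, lng2, lat2):
    # Haversine great-circle distance; inputs are assumed to be in radians.
    if lng2 is None or lat2 is None:
        return None  # the first row of each window has no previous point
    earth_radius_km = 6371.0
    dlng, dlat = lng2 - lng1, lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlng / 2) ** 2
    return 2 * earth_radius_km * asin(sqrt(a))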
Example #22
    def __init__(self):
        super(FeatureRequestIntervalVariance, self).__init__()

        self.w = Window.partitionBy(
            F.col('client_request_host'), F.col('client_ip')
        ).orderBy(F.col("@timestamp"))
        self.group_by_aggs = {
            'request_interval_var': F.variance(
                F.col('request_interval').cast('float') / 60.
            ),
        }
        self.pre_group_by_calcs = {
            'row_num_per_group':
                F.row_number().over(self.w),
            'prev_ts': F.lag(F.col('@timestamp')).over(
                self.w),
            'request_interval': F.when(
                F.col('row_num_per_group') > 1,
                F.when(
                    F.isnull(
                        F.col('@timestamp').cast('long') -
                        F.col('prev_ts').cast('long')
                    ), 0
                ).otherwise(
                    F.col('@timestamp').cast('long') -
                    F.col('prev_ts').cast('long')
                )).otherwise(None),
        }
Example #23
def main(
    salt,
    iterations,
    klen,
    project,
    input_table,
    output_table,
    bucket,
):
    spark = (SparkSession.builder.appName("adjust_gps_hash").getOrCreate())

    @udf("string")
    def pbkdf2_sha1hmac(msg, salt, iterations, klen):
        import hashlib
        import base64
        return base64.b64encode(
            hashlib.pbkdf2_hmac('sha1', str.encode(msg), str.encode(salt),
                                iterations, klen)).decode()

    (spark.read.format("bigquery").option("table", f"{project}.{input_table}").
     load().where(~isnull("gps_adid")).withColumn(
         "identifier",
         pbkdf2_sha1hmac(col("gps_adid"), lit(salt),
                         lit(iterations), lit(klen))).select(
                             "identifier",
                             "installed_at").write.format("bigquery").option(
                                 "table", f"{project}.{output_table}").option(
                                     "temporaryGcsBucket",
                                     bucket).mode("overwrite").save())

    spark.stop()
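The UDF body is plain hashlib, so the key derivation can be sanity-checked locally without Spark; a small example with made-up inputs:

import base64
import hashlib

# Same derivation as the pbkdf2_sha1hmac UDF above, run on a single value.
digest = hashlib.pbkdf2_hmac('sha1', b'example-gps-adid', b'example-salt',
                             10000, 32)
print(base64.b64encode(digest).decode())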
Example #24
def remove_positive_samples_from_negative_samples(neg_samples, pos_samples):
    pos_samples_to_remove = pos_samples.select('date', 'hour', 'street_id',
                                               lit(1).alias('exists'))
    neg_samples = (neg_samples.join(
        pos_samples_to_remove, ['date', 'hour', 'street_id'],
        "left_outer").filter(isnull('exists')).drop('exists'))
    return neg_samples
Example #25
    def cleanNullVisitors(self, adsDf):
        totalRecords = adsDf.count()
        adsFilteredVisitors = adsDf.filter(~isnull("visitorId"))
        remainingRecords = adsFilteredVisitors.count()
        print("Total number of records::" + str(totalRecords))
        print("Removed visitors with null visitorId count::" +
              str(totalRecords - remainingRecords))
        return adsFilteredVisitors
Example #26
def l2_to_l3(input_df):
    df = input_df
    df = df.sort(df.LAST_UPDT_DATE_TIME.desc())
    window = Window.partitionBy(df['INV_ID']).orderBy(df['LAST_UPDT_DATE_TIME'].desc())
    df = df.select('*', row_number().over(window).alias('row_number')).filter(col('row_number') == 1)
    delete_is_null = F.isnull(F.col("LOG_DEL_IND"))
    df = df.where(delete_is_null)
    return df
Example #27
    def filter_df(self, df):
        return df.filter(
            ~F.isnull(df.experiments[self.experiment_slug])
        ).filter(
            df.submission_date_s3 >= self.start_date
        ).withColumn(
            'branch', df.experiments[self.experiment_slug]
        )
Example #28
def null_values(df):
    schema = {col: col_type for col, col_type in df.dtypes}
    nulls = [
        json.loads(x) for x in df.select([
            count(when(isnull(c), c)).alias(c) for c, type in schema.items()
            if type != "timestamp"
        ]).toJSON().collect()
    ][0]
    return nulls
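A possible usage, assuming an active SparkSession named spark (hypothetical):

df = spark.createDataFrame(
    [(1, None), (2, "a"), (None, "b")], ["id", "label"])
print(null_values(df))  # e.g. {'id': 1, 'label': 1}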
Example #29
def get_growth(data, key):
	data = sc.parallelize(data.collect()[key][1])
	df = sqlc.createDataFrame(data, ["date", "value"])
	my_window = Window.partitionBy().orderBy("date")

	df = df.withColumn("prev_value", F.lag(df.value).over(my_window))
	df = df.withColumn("diff", F.when(F.isnull(((df.value - df.prev_value)/df.prev_value)*100), 0)
								  .otherwise((df.value - df.prev_value)/df.prev_value)*100)
	return df.rdd.map(lambda x: x.date.encode("utf-8")).collect(), df.rdd.map(lambda x: x.diff).collect()	
Example #30
    def get_missing(self):
        nulls = self.data_frame.select(*(when(isnull(c), 1).otherwise(0).alias(c) for c in self.get_variables_segregated()[0]))
        agrupaded = nulls.groupBy().sum(*self.get_variables_segregated()[0])
        final = agrupaded.select(*(agrupaded['sum(' + c + ')'].alias(c) for c in self.get_variables_segregated()[0])). \
            toPandas().transpose()
        final = final.reset_index()
        return spark_session.createDataFrame(final).\
            withColumnRenamed('index', 'Variavel').\
            withColumnRenamed('0', 'missing')
    def transform(self, df, lowerPerc=None, upperPerc=None):

        # check if trained
        if not self._is_trained:
            raise ValueError('You must call the train method first')

        # lower filtering
        if lowerPerc is not None and 0 < lowerPerc < 100:
            lowerFilter = self.ntiles_[:, int(lowerPerc)][:, 1]
            for i, feature in enumerate(self.columns_):
                df = df.where((df[feature] > lowerFilter[i]) | F.isnull(df[feature]))

        # upper filtering
        if upperPerc is not None and 0 < upperPerc < 100:
            upperFilter = self.ntiles_[:, int(upperPerc)][:, 1]
            for i, feature in enumerate(self.columns_):
                df = df.where((df[feature] < upperFilter[i]) | F.isnull(df[feature]))

        return df
redemptionsDf = redDf.select(
    redDf["EnqueuedDateTimeUTC"],
    redDf["jsonBody.redemption.rewardsRedemptionId"],
    redDf["jsonBody.redemption.rewardsTransactionId"],
    redDf["jsonBody.redemption.correlationId"],
    redDf["jsonBody.redemption.eventId"],
    redDf["jsonBody.redemption.programName"],
    redDf["jsonBody.redemption.createdDate"],
    redDf["jsonBody.redemption.pointsRedeemed"],
    redDf["jsonBody.redemption.accrualCreatedDate"],
    redDf["jsonBody.redemption.accrualRewardsActivityName"],
    redDf["jsonBody.redemption.accrualBrandCode"],
    redDf["jsonBody.redemption.accrualChannelSource"],
    redDf["jsonBody.redemption.modifiedDate"],
    redDf["jsonBody.redemption.accrualStorenumber"],
    redDf["jsonBody.redemption.accrualMerchantid"],
    redDf["jsonBody.redemption.typeCode"])

memberBalancesDf = memBalDf.select(
    memBalDf["EnqueuedDateTimeUTC"],
    memBalDf["jsonBody.membershipPointBalance.membershipId"],
    memBalDf["jsonBody.membershipPointBalance.memberId"],
    memBalDf["jsonBody.membershipPointBalance.programName"],
    memBalDf["jsonBody.membershipPointBalance.currentLevelPoints"],
    memBalDf["jsonBody.membershipPointBalance.modifiedDate"],
    memBalDf["jsonBody.membershipPointBalance.eventId"])

# COMMAND ----------

# We only need accrual records of type 'Reward' with pointsUsed = 0. Additionally, an accrual record gets sent when a member changes status from Chase to non-Chase or vice versa.
accrualsDf = accrualsDf.filter("pointsUsed = '0'")
accrualsDf = accrualsDf.filter("pointType = 'Reward'")
accrualsDf = accrualsDf.filter("rewardsTransactionSubType <> 'Point Expiration Updated'")

# COMMAND ----------

# derive tender_type: rows without a card number are assumed to be Non-SVC, the rest SVC
from pyspark.sql import functions as F
transactionsDf = transactionsDf.withColumn("tender_type", F.when(F.isnull(transactionsDf['cardNumber']), "Non-SVC").otherwise("SVC"))

# COMMAND ----------

# check whether the pointsAccrued and pointsRedeemed columns hold valid numbers and keep only rows with valid numeric values
from pyspark.sql import functions as F
accrualsDf = accrualsDf.withColumn("pointsAccruedNumber", accrualsDf.pointsAccrued.cast('double'))
accrualsNumberErrorDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNull())
accrualsDf = accrualsDf.where(accrualsDf["pointsAccruedNumber"].isNotNull())
redemptionsDf = redemptionsDf.withColumn("pointsRedeemedNumber", redemptionsDf.pointsRedeemed.cast('double'))
memberBalancesDf = memberBalancesDf.withColumn("currentLevelPointsNumber", memberBalancesDf.currentLevelPoints.cast('double'))
redemptionsNumberErrorDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNull())
redemptionsDf = redemptionsDf.where(redemptionsDf["pointsRedeemedNumber"].isNotNull())

# COMMAND ----------