Example #1
def filter_outliers(dataframe, exclude_columns):
    """
    For every feature except those in exclude_columns, set values with |z-score| >= 3 to NULL.
    """
    for column in dataframe.columns:
        if column in exclude_columns:
            continue
        # Exclude boolean types.
        if dataframe.schema[column].dataType == BooleanType():
            continue
        stats = dataframe \
            .select(_mean(col(column)).alias('mean'), stddev(col(column)).alias('std')) \
            .collect()
        mean = stats[0]['mean']
        std = stats[0]['std']
        print("mean: %s; std: %s" % (str(mean), str(std)))
        count_before = dataframe.filter(col(column).isNull()).count()
        dataframe = dataframe.withColumn(
            column,
            when(abs((col(column) - mean) / std) < 3,
                 col(column)).otherwise(None))
        print("Deleted %s entries because of z-score (3) for %s." % (
            str(dataframe.filter(col(column).isNull()).count() - count_before),
            column))
    return dataframe
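# A minimal, hypothetical driver for the snippet above (not part of the
# original example). The import aliases match the names filter_outliers uses;
# the toy data is for illustration only, and with a 3-sigma threshold such a
# tiny sample would not actually have any value nullified.
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean as _mean, stddev, col, when, abs
from pyspark.sql.types import BooleanType

spark = SparkSession.builder.appName("outlier-demo").getOrCreate()
demo_df = spark.createDataFrame(
    [(1.0, True), (2.0, False), (3.0, True), (1000.0, False)],
    ["value", "flag"])
filter_outliers(demo_df, exclude_columns=["flag"]).show()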
Example #2
    def find(self, data, spark_context):
        rows, columns = spark_shape(data)
        n_clusters = get_n_clusters(data, data.columns[-1])
        columns -= 2
        mean_columns = map(lambda x: _mean(col(x)).alias('mean'),
                           data.columns[:-2])
        df_stats = data.select(*mean_columns).collect()
        df = add_iter(data)
        self.x_center = np.array(df_stats[0])
        self.centroids = cluster_centroid(df, spark_context, n_clusters, 3)
        self.diameter = find_diameter(df, spark_context, 3)
        ch = float(rows - n_clusters) / float(n_clusters - 1)

        self.cluster_sizes = count_cluster_sizes(df, n_clusters, spark_context,
                                                 3)
        self.numerator = [0 for _ in range(n_clusters)]
        for i in range(0, n_clusters):
            self.numerator[i] = self.cluster_sizes[i] * euclidian_dist(
                self.centroids[i], self.x_center)
        denominator_sum = spark_context.accumulator(0)
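        # denominator_sum is a Spark accumulator: the += inside f runs on the
        # executors during the foreach action below, and the driver reads the
        # aggregated total via .value afterwards.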

        def f(row, denominator_sum, centroid):
            denominator_sum += np.sqrt(
                np.sum(np.square(np.array(row[:-3]) - centroid[row[-2]])))

        df.rdd.foreach(lambda row: f(row, denominator_sum, self.centroids))

        self.denominator = denominator_sum.value
        ch *= np.sum(self.numerator)
        ch /= self.denominator
        return -ch
Example #3
def get_mean_and_std(all_harvest_df):
    # https://stackoverflow.com/a/47995478
    df_stats = all_harvest_df.select(
        _mean(col('yield')).alias('mean'),
        _stddev(col('yield')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    return mean, std
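# Hypothetical usage of get_mean_and_std (not part of the original snippet);
# it assumes the same imports as above (col, and mean/stddev as _mean/_stddev)
# and flags yields more than two standard deviations from the mean. The
# threshold is illustrative.
mean, std = get_mean_and_std(all_harvest_df)
flagged_df = all_harvest_df.withColumn(
    'yield_outlier', abs((col('yield') - mean) / std) > 2)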
Example #4
 def normalize_grade(self, df):
     print("Normalizing data ...")
     mean_df = df.groupBy(self.item_col).agg(
         _mean(self.grade_col).alias("mean"))
     #mean_df.show(100)
     df = df.join(mean_df, [self.item_col])
     #df.show(100)
     df = df.withColumn(self.grade_col, col(self.grade_col) - col("mean"))
     #df.show(100)
     return df, mean_df
Example #5
def get_prcp_day(df_in):
    # Group the values by day, aggregating the relevant fields
    print('BUILDING DATAFRAME WITH VALUES AGGREGATED BY DAY')
    df_out = df_in.groupby('city', 'yr', 'mo', 'da').agg(
        _abs(_max('lat')).alias('latitude'),
        _sum('prcp').alias('prcp_dia'),
        _max('tmax').alias('tmax'),
        _min('tmin').alias('tmin'),
        _mean('temp').alias('med_temp')).orderBy('city', 'yr', 'mo', 'da')
    return df_out
Example #6
def mean_stdv(df):
    unlist = udf(lambda x: round(float(list(x)[0]),3), DoubleType())
    for i in ["count"]:
        assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")
        scaler = MinMaxScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        df = pipeline.fit(df).transform(df).withColumn(i+"_Scaled", unlist(i+"_Scaled")).drop(i+"_Vect")
        df_stats = df.select(_mean(col('count_Scaled')).alias('mean'),_stddev(col('count_Scaled')).alias('std')).collect()
        # mean = df_stats[0]['mean']
        # std = df_stats[0]['std']
        return df_stats 
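# Hypothetical usage of mean_stdv (it mirrors how the function is called later
# in this listing): the returned list of Rows exposes the aggregates by alias.
df_stats = mean_stdv(df)
mean = df_stats[0]['mean']
std = df_stats[0]['std']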
Example #7
 def normalize_grade(self, df):
     # print("Normalizing data ...")
     mean_df = df.groupBy(self.user_col).agg(
         _mean(self.grade_col).alias("mean"))
     mean_df_rename = mean_df.withColumnRenamed(self.user_col, "USER")
     #mean_df.show(100)
     df = df.join(mean_df_rename,
                  df[self.user_col] == mean_df_rename["USER"]).drop("USER")
     #df.show(100)
     df = df.withColumn(self.grade_col, col(self.grade_col) - col("mean"))
     #df.show(100)
     return df, mean_df_rename
Example #8
    async def df_sbx_customer_special_box_purchased(self, sbx, spark):
        data = await  sbx.with_model('cart_box') \
            .set_page_size(1000) \
            .and_where_is_not_null('purchase') \
            .and_where_is_equal('variety', os.environ['SPECIAL_BOX']).find()
        sc = spark.sparkContext

        def deleteMeta(d):
            dt = {}
            dt['customer'] = d['customer']
            dt['total_items'] = d['total_items']
            dt['current_percentage'] = d['current_percentage']
            dt['count'] = 1
            return dt

        dit = list(map(deleteMeta, data['results']))
        tmp = sc.parallelize(dit, numSlices=100)
        df = spark.read.option("multiLine", "true").json(tmp)
        df2 = df.groupBy("customer").agg(
            func.avg("total_items").alias('total_items'),
            func.avg("current_percentage").alias('current_percentage'),
            func.sum("count").alias('count'))

        (cumean, custd, comean, costd, tmean, tstd) = df2.select(
            _mean(col('current_percentage')).alias('cumean'),
            _stddev(col('current_percentage')).alias('custd'),
            _mean(col('count')).alias('comean'),
            _stddev(col('count')).alias('costd'),
            _mean(col('total_items')).alias('tmean'),
            _stddev(col('total_items')).alias('tstd'),
        ).first()
        df3 = df2.withColumn(
            "acurrent_percentage",
            (col("current_percentage") - cumean) / custd).withColumn(
                "acount", (col("count") - comean) / costd).withColumn(
                    "atotal_items", (col("total_items") - tmean) / tstd)
        vecAssembler = VectorAssembler(
            inputCols=["acurrent_percentage", "acount", "atotal_items"],
            outputCol="features")
        return vecAssembler.transform(df3)
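# Aside (a sketch, not the original code): the manual z-scoring above could be
# replaced by assembling the raw columns first and letting
# pyspark.ml.feature.StandardScaler do the per-feature centering and scaling.
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler, VectorAssembler

assembler = VectorAssembler(
    inputCols=["current_percentage", "count", "total_items"],
    outputCol="raw_features")
scaler = StandardScaler(inputCol="raw_features", outputCol="features",
                        withMean=True, withStd=True)
scaled_df = Pipeline(stages=[assembler, scaler]).fit(df2).transform(df2)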
Example #9
def get_metrics(df_in):
    # Group the values, aggregating the relevant fields
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('yr', 'city').agg(
        _max('elvt').alias('ele_max'),
        _mean('temp').alias('med_temp'),
        _mean('tmin').alias('med_temp_min'),
        _mean('tmax').alias('med_temp_max'),
        _sum('prcp').alias('prcp'),
        _mean('hmdy').alias('med_umi'),
        _mean('hmin').alias('med_umi_min'),
        _mean('hmax').alias('med_umi_max'),
        _mean('wdsp').alias('med_velo_vento'),
        _mean('gust').alias('med_velo_rajadas_vento')).orderBy('yr', 'city')
    return df_out
Example #10
def stream_to_control_chart(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [_mean(col(column)).alias(column) for column in columns]

    df = df.select(conditions_mean).toPandas()
    df["time_stamp"] = time.timestamp() * 1000

    if id in streaming_dict:
        streaming_dict[id] = streaming_dict[id].append(df, ignore_index=True)
    else:
        streaming_dict[id] = df
Example #11
def incomeZScore():
    df_stats = dataset.select(
        _mean(col('Average_Income')).alias('mean'),
        _stddev(col('Average_Income')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        (dataset['Average_Income'] - mean) / std).withColumnRenamed(
            "((Average_Income - 58348.17333333333) / 9095.510688184871)",
            "z_score_AvgInc").alias("z_score_AvgInc")
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = dataset.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
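# Aside (a sketch, not the project's code): the withColumnRenamed call above
# only works because the hard-coded string happens to match the auto-generated
# column name, which embeds this dataset's exact mean and std. Aliasing the
# expression directly avoids that coupling:
df1 = dataset.select(
    ((dataset['Average_Income'] - mean) / std).alias('z_score_AvgInc'))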
Example #12
def crimeZScore():
    df2 = unemploymentZScore()
    df_stats = dataset.select(
        _mean(col('Crime_Percent')).alias('mean'),
        _stddev(col('Crime_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        (dataset['Crime_Percent'] - mean) / std).withColumnRenamed(
            "((Crime_Percent - 3.2683999999999975) / 0.8328317973490115)",
            "z_score_Crime")
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #13
def unemploymentZScore():
    df2 = incomeZScore()
    df_stats = dataset.select(
        _mean(col('Unemployment_Percent')).alias('mean'),
        _stddev(col('Unemployment_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        (dataset['Unemployment_Percent'] - mean) / std).withColumnRenamed(
            "((Unemployment_Percent - 7.450666666666669) / 2.512157640140963)",
            "z_score_Unem")
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #14
def homelessZScore():
    df2 = crimeZScore()
    df_stats = dataset.select(
        _mean(col('Homeless_Percent')).alias('mean'),
        _stddev(col('Homeless_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        (dataset['Homeless_Percent'] - mean) / std).withColumnRenamed(
            "((Homeless_Percent - 0.17706666666666662) / 0.09455791640084463)",
            "z_score_Homeless")
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #15
def AverageAndStd(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [
        _mean(col(column)).alias(column + "_mean") for column in columns
    ]
    conditions_std = [
        _stddev(col(column)).alias(column + "_stddev") for column in columns
    ]

    df = df.select(conditions_mean + conditions_std).toPandas()
    df["time_stamp"] = time.timestamp() * 1000

    if id in streaming_dict:
        streaming_dict[id] = streaming_dict[id].append(df, ignore_index=True)
    else:
        streaming_dict[id] = df
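# Aside (a sketch, not the original code): DataFrame.append was removed in
# pandas 2.0, so on newer pandas the first branch above would be written with
# pd.concat instead, e.g.:
#     streaming_dict[id] = pd.concat([streaming_dict[id], df],
#                                    ignore_index=True)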
Example #16
def create_parquet_files(spark_session):
    """
    Create a histogram (value counts) for each feature and write it to parquet.

    Args:
        spark_session: active SparkSession used to read and write the data

    Returns:
        list of results (left empty in this implementation)
    """
    all_features = spark_session.read.parquet('/user/***REMOVED***/StackOverflow/output_stackoverflow.parquet')
    all_features = all_features.filter(all_features['is_question'])

    all_results = []

    for feature in INTEGER_FEATURES + FLOAT_FEATURES + BOOLEAN_FEATURES:

        # Replace outliers with |z-score| >= 3 with -1, unless the feature is boolean
        if feature not in BOOLEAN_FEATURES:
            stats = all_features \
                .select(_mean(col(feature)).alias('mean'), stddev(col(feature)).alias('std')) \
                .collect()
            mean = stats[0]['mean']
            std = stats[0]['std']

            all_features = all_features.withColumn(feature,
                                                   when(abs((col(feature) - mean) / std) < 3,
                                                        col(feature)).otherwise(-1))

        if feature in FLOAT_FEATURES:
            # Bucketize each float feature into rounded number buckets
            all_features = all_features.withColumn(feature, round(col(feature), 2))

        for resolved in [True, False]:
            new_file = all_features.filter(col('has_answer') == resolved) \
                .select(feature) \
                .groupBy(feature).count()

            filename = feature + '_1' if resolved else feature + '_0'
            new_file.write.mode('overwrite') \
                .parquet('/user/***REMOVED***/StackOverflow/swashbuckler/output_' + filename + '.parquet')

    return all_results
Example #17
def prepareData():
    df_newOpiFac = newOpioidFactor()
    df_newOpiFac.show()
    df_AvgInc = homelessZScore()

    df_stats = df_newOpiFac.select(
        _mean(col('new_opioid_factor')).alias('mean'),
        _stddev(col('new_opioid_factor')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = df_newOpiFac.select(
        (df_newOpiFac['new_opioid_factor'] - mean) / std).withColumnRenamed(
            "((new_opioid_factor - 7447.090505536551) / 12252.01952320687)",
            "z_score_opioid").alias("z_score_opioid")
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df_AvgInc.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
Example #18
# df.printSchema()

df = df.select(' _dewptm', ' _fog',' _pressurem', ' _rain', ' _tempm', \
                ' _thunder', ' _vism', ' _wdird', ' _wspdm', ' _conds')

cols = df.columns

stages = []

# df.groupBy(" _conds") \
#     .count() \
#     .orderBy(col("count").desc()) \
#     .show()

df_stats = df.select(
    _mean(col(' _vism')).alias('mean_vism'),
    _mean(col(' _wdird')).alias('mean_wdird'),
    _mean(col(' _wspdm')).alias('mean_wspdm'),
).collect()

mean_vism = df_stats[0]['mean_vism']
mean_wdird = df_stats[0]['mean_wdird']
mean_wspdm = df_stats[0]['mean_wspdm']

df = df.fillna({' _vism': mean_vism})
df = df.fillna({' _wdird': mean_wdird})
df = df.fillna({' _wspdm': mean_wspdm})

# print(mean_vism , mean_wdird , mean_wspdm)

# numeric_features = [t[0] for t in df.dtypes if t[1] == 'int' or t[1] == 'float'or t[1] == 'double']
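# Aside (a sketch, not the original code): assuming the three columns are
# numeric, pyspark.ml.feature.Imputer can compute and fill the means in a
# single pass instead of the manual select/fillna above. The output column
# names are illustrative.
from pyspark.ml.feature import Imputer

imputer = Imputer(strategy="mean",
                  inputCols=[' _vism', ' _wdird', ' _wspdm'],
                  outputCols=[' _vism_filled', ' _wdird_filled', ' _wspdm_filled'])
df_imputed = imputer.fit(df).transform(df)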
Example #19
# columns to drop
cols_to_drop = {
    'CustomerID', 'ThreewayCalls', 'CurrentEquipementDays',
    'HandsetRefurbished', 'TruckOwner', 'RVOwner', 'Homeownership',
    'BuysViaMailOrder', 'NotNewCellphoneUser', 'OwnsMotorcycle'
}

# Drop columns
churn = drop_columns(churn, cols_to_drop)

# Deal with missing values
churn = churn.filter(churn.ServiceArea.isNotNull())

# HandsetPrice
handset_mean = churn.select(_mean("HandsetPrice").alias("mean")).first()[0]
churn = churn.withColumn(
    "HandsetPrice",
    when(churn["HandsetPrice"] == "Unknown",
         handset_mean).otherwise(churn["HandsetPrice"]))

# Get rid of nulls
null_dict = find_null_counts(churn)
null_set = set(null_dict.keys())
churn = remove_nulls(churn, null_set)

# Columns to cast to different type
string_columns = {
    "Churn", "ServiceArea", "ChildrenInHH", "HandsetWebCapable",
    "RespondsToMailOffers", "OptOutMailings", "NonUSTravel", "OwnsComputer",
    "HasCreditCard", "NewCellphoneUser", "MadeCallToRetentionTeam",
Example #20
def main(
    output_folder="./www/stepchain",
    start_date=None,
    end_date=None,
    last_n_days=15,
):
    """Get step data in wmarchive.

    Each step array contains multiple steps. The udf returns each step as a
    separate row in a list, and flatMap flattens those lists so every step
    becomes an individual row in the dataframe.
    """
    # Borrowed logic from condor_cpu_efficiency
    _yesterday = datetime.combine(date.today() - timedelta(days=1),
                                  datetime.min.time())
    if not (start_date or end_date):
        end_date = _yesterday
        start_date = end_date - timedelta(days=last_n_days)
    elif not start_date:
        start_date = end_date - timedelta(days=last_n_days)
    elif not end_date:
        end_date = min(start_date + timedelta(days=last_n_days), _yesterday)
    if start_date > end_date:
        raise ValueError(
            f"start date ({start_date}) should be earlier than end date({end_date})"
        )

    spark = get_spark_session()
    df_raw = spark.read.option("basePath", _DEFAULT_HDFS_FOLDER).json(
        get_candidate_files(start_date, end_date, spark, base=_DEFAULT_HDFS_FOLDER)
    ) \
        .select(["data.*", "metadata.timestamp"]) \
        .filter(
        f"""data.meta_data.jobstate='success'
                  AND data.meta_data.jobtype='Production'
                  AND data.wmats >= {start_date.timestamp()}
                  AND data.wmats < {end_date.timestamp()}
                  """
    )
    df_rdd = df_raw.rdd.flatMap(lambda r: udf_step_extract(r))
    df = spark.createDataFrame(df_rdd,
                               schema=get_schema()).dropDuplicates().where(
                                   _col("ncores").isNotNull()).cache()
    df_details = df.groupby(["task", "site", "step_name"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
        _collect_set("acquisitionEra").alias("acquisitionEra"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    df_task = df.groupby(["task"]).agg(
        (100 * (_sum("jobCPU") / _mean("nthreads")) /
         _sum("jobTime")).alias("avg_cpueff"),
        _count(lit(1)).alias("#jobs"),
        _mean("steps_len").alias("#steps"),
        _mean("nthreads").alias("#nthreads"),
        _mean("ncores").alias("#ncores"),
        (_sum("jobCPU") / _count(lit(1))).alias("avg_jobCPU"),
        (_sum("jobTime") / _count(lit(1))).alias("avg_jobTime"),
    ).withColumn("avg_cpueff",
                 _col("avg_cpueff").cast(IntegerType())).toPandas()
    write_htmls(df_details, df_task, start_date, end_date, output_folder)
Example #21
# 3. DATA PREPROCESSING
# drop Name, PassengerID and Ticket (identifiers, not attributes)
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived', 'Cabin')

# Dealing with missing values
# CABIN: drop Cabin; 77.1% of its values are null.
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived')

# AGE: Substitute by Mean
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

df_stats = df.select(
    _mean(col('Age')).alias('mean'),
    _stddev(col('Age')).alias('std')).collect()
mean = df_stats[0]['mean']
std = df_stats[0]['std']
# mean = 29.7, rounded to 30
df = df.fillna(30, subset=['Age'])

# FARE: Substitute by Mean
df_stats = df.select(_mean(col('Fare')).alias('mean')).collect()
mean = df_stats[0]['mean']
# mean=33.2955
df = df.fillna(33.2955, subset=['Fare'])

# EMBARKED: Substitute by most common class
df.groupBy(df['Embarked']).count().show()
# The most common class is 'S'
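# The snippet stops after inspecting the counts; the imputation it implies
# (not shown in the original) would be:
df = df.fillna('S', subset=['Embarked'])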

Example #22
# Standardization: transform a feature so that the transformed array of values
# has mean = 0 and standard deviation = 1
def standardize(x, meanVal, stdVal):
    return (x - meanVal) / stdVal


def standardize_udf(meanVal, stdVal):
    return udf(lambda x: standardize(x, meanVal, stdVal), FloatType())


# extract mean and standard deviation value for the column 'height_percentage'
df_stats_hp = df.select(
    _mean(col('height_percentage')).alias('mean'),
    _stddev(col('height_percentage')).alias('std')).collect()

mean_hp = df_stats_hp[0]['mean']
std_hp = df_stats_hp[0]['std']

# extract mean and standard deviation value for the column 'age'
df_stats_age = df.select(
    _mean(col('age')).alias('mean'),
    _stddev(col('age')).alias('std')).collect()

mean_age = df_stats_age[0]['mean']
std_age = df_stats_age[0]['std']

# perform simple standardization on the 'age' and 'height_percentage' column
df = df.withColumn('height_percentage',
                   standardize_udf(mean_hp, std_hp)(col('height_percentage')))
Example #23
    'INDUS_outliers', 'INDUS').show()
bounded_df.filter(bounded_df.CHAS_outliers != 0).select(
    'CHAS_outliers', 'CHAS').show()
bounded_df.filter(bounded_df.NOX_outliers != 0).select('NOX_outliers',
                                                       'NOX').show()
bounded_df.filter(bounded_df.RM_outliers != 0).select('RM_outliers',
                                                      'RM').show()
'''
Spark does not support data visualization natively;
here the Databricks display() method is used to visualize the content.

'''
display(
    bounded_df.select('CRIM_outliers', 'ZN_outliers', 'INDUS_outliers',
                      'CHAS_outliers', 'NOX_outliers', 'RM_outliers',
                      'AGE_outliers', 'DIS_outliers', 'RAD_outliers',
                      'TAX_outliers', 'PTRATIO_outliers', 'B_outliers',
                      'LSTAT_outliers', 'PRICE_outliers'))

#Calculating Z - score
val = df.select(df.INDUS.cast("int"))
df_stats = df.select(
    _mean(df.INDUS.cast("double")).alias('mean'),
    _stddev(df.INDUS.cast("double")).alias('std'),
).collect()
# Add a z-score column for INDUS to the dataframe
mean = df_stats[0]['mean']
std = df_stats[0]['std']
score_INDUS = df.withColumn("z score_INDUS",
                            (df.INDUS.cast("double") - mean) / std)
score_INDUS.show()
Example #24
              str(processCount) + ' - ' + filename)
        # Read file to dataset and apply all regex functions
        found_type = []
        fileinfo = []
        regex_res = []
        df = sqlContext.read.format("csv").option(
            "header", "false").option("inferSchema", "true").option(
                "delimiter",
                "\t").schema(customSchema).load(inputDirectory + filename)
        df_stats = mean_stdv(df)
        mean = df_stats[0]['mean']
        std = df_stats[0]['std']
        count_all = count_all_values(df)

        #added col_length which is the average length of the col
        df_length = df.select(_mean(length(col("val"))).alias('avg_length'))
        col_length = df_length.collect()[0][0]

        percentage_website, found_type, type_count_web = re_find_website(
            df, count_all, found_type)
        percentage_zip, found_type, type_count_zip = re_find_zipCode(
            df, count_all, found_type)
        percentage_buildingCode, found_type, type_count_building = re_find_buildingCode(
            df, count_all, found_type)
        percentage_phoneNum, found_type, type_count_phone = re_find_phoneNum(
            df, count_all, found_type)
        percentage_lat_lon, found_type, type_count_lat_lon = re_find_lat_lon(
            df, count_all, found_type)
        percentage_add_st, found_type, type_count_add_st = re_find_street_address(
            df, count_all, col_length, found_type)
        percentage_school_name, found_type, type_count_school_name = re_find_school(
Example #25
    # print schema
    print(init_flat_data.printSchema(), '\n')

    # calculate min and max of order_date in order to compute recency
    max_order_date, min_order_date = init_flat_data \
        .select( _max(col('order_date')), _min(col('order_date'))) \
        .take(1)[0]

    # calculate recency/frequency and monetary
    calculate_diff_day = udf(lambda x: (max_order_date - x).days,
                             IntegerType())
    rfm_table = init_flat_data \
        .withColumn('recency', calculate_diff_day('order_date')) \
        .groupby(['company_id', 'company_name', 'country']) \
        .agg(
            _mean(col('recency')).alias('recency'),
            _count(col('order_id')).alias('frequency'),
            _sum(col('NBI')).alias('monetary')
        )

    # calculate quantiles for each variable
    quantiles = rfm_table.approxQuantile(['recency', 'frequency', 'monetary'],
                                         [0.20, 0.4, 0.6, 0.8], 0)
    r_quantile = quantiles[0]
    f_quantile = quantiles[1]
    m_quantile = quantiles[2]

    # calculate score of each variable
    def_r_score = udf(
        lambda x: 5 if x < r_quantile[0] else 4 if x < r_quantile[1] else 3
        if x < r_quantile[2] else 2 if x < r_quantile[3] else 1, IntegerType())
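    # Sketch (not in the original snippet): the score UDF would then be applied
    # to the RFM table, with analogous UDFs for frequency and monetary.
    rfm_table = rfm_table.withColumn('r_score', def_r_score(col('recency')))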
Example #26
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as _sum, udf, col, mean as _mean
from pyspark.sql.types import StringType

import time

spark = SparkSession.builder.appName('luigi').getOrCreate()
# avoid creating a new Spark session every time the code is executed

dfG = spark.read.csv('/data/ethereum/transactions',
                     header=True).select('block_timestamp',
                                         'gas').orderBy("block_timestamp")
# read the transactions CSV (with header), keep block_timestamp and gas, and order by block_timestamp

udfG = udf(lambda x: time.strftime("%Y %m", time.gmtime(x)),
           StringType())  #Define UDF function

dfG = dfG.withColumn(
    'time', udfG(col('block_timestamp').cast("integer"))).groupBy("time").agg(
        _mean("gas")).orderBy("time")
#dfG.show()
dfG.repartition(1).write.csv("outCSpark", sep=",", header=True)
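# Aside (a sketch, not the original code): coalesce(1) yields the same single
# output file as repartition(1) above, but without forcing a full shuffle.
dfG.coalesce(1).write.csv("outCSpark", sep=",", header=True)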
Example #27
def get_et0(df_in):
    # Group the values, aggregating the relevant fields
    print('BUILDING DATAFRAME WITH AGGREGATED VALUES')
    df_out = df_in.groupby('city', 'yr', 'mo').agg(
        _mean('ET0').alias('ET0_MES')).orderBy('city', 'yr', 'mo')
    return df_out
Example #28
                            (dmin is not null AND trim('dmin') != "") AND
                            (hmdy is not null AND trim('hmdy') != "") AND
                            (hmax is not null AND trim('hmax') != "") AND
                            (hmin is not null AND trim('hmin') != "") AND
                            (wdsp is not null AND trim('wdsp') != "") AND
                            (wdct is not null AND trim('wdct') != "") AND
                            (gust is not null AND trim('gust') != "")
                        ''')

print(df_clima_new.count())
print(df_clima.count())

# Remove NULL or '' values from the value fields

df_clima_agg = df_clima.groupby('city', 'yr').agg(
    _mean('prcp').alias('prcp_mean'),
    _mean('stp').alias('stp_mean'),
    _mean('smax').alias('smax_mean'),
    _mean('smin').alias('smin_mean'),
    _mean('gbrd').alias('gbrd_mean'),
    _mean('temp').alias('temp_mean'),
    _mean('dewp').alias('dewp_mean'),
    _mean('tmax').alias('tmax_mean'),
    _mean('dmax').alias('dmax_mean'),
    _mean('tmin').alias('tmin_mean'),
    _mean('dmin').alias('dmin_mean'),
    _mean('hmdy').alias('hmdy_mean'),
    _mean('hmax').alias('hmax_mean'),
    _mean('hmin').alias('hmin_mean'),
    _mean('wdsp').alias('wdsp_mean'),
    _mean('wdct').alias('wdct_mean'),