from pyspark.sql.functions import col, collect_list, countDistinct, stddev as _stddev


async def cluster_by_purchase_behaivor(sbx,
                                       spark,
                                       user_cluster,
                                       date_int,
                                       kmeans=3):

    print('fetch data')
    df = await user_cluster.df_sbx_customer_purchase_behaivor(
        sbx, spark, date_int)
    total = df.count()
    best_score = total * 100
    best_df = None
    print('Creating clusters')
    clustering_types = [user_cluster.bisecting_means, user_cluster.k_means]
    total_clustering_types = len(clustering_types)
    k = kmeans
    for i in range(total_clustering_types):
        if i == total_clustering_types - 1:
            model = await clustering_types[i](df, k, 1)
        else:
            model = await clustering_types[i](df, k)
        transformed = await user_cluster.run_cluster(model, df)
        grouped = transformed.groupBy("prediction").agg(
            countDistinct('customer').alias("count"))
        std_row = grouped.select(_stddev(col('count')).alias('std')).first()
        score = std_row['std']
        if score < best_score:
            best_score = score
            best_df = transformed
        k = len(model.clusterCenters())

    return best_df.groupBy("prediction").agg(
        collect_list(col("customer")).alias("customers"),
        countDistinct('customer').alias("count")).sort("count", ascending=False)
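# A self-contained sketch (toy data, not from the original) of the scoring idea used
# above: the "best" clustering is the one whose per-cluster distinct-customer counts
# have the smallest standard deviation, i.e. the most evenly sized clusters.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [("c1", 0), ("c2", 0), ("c3", 1), ("c4", 1), ("c5", 2)],
    ["customer", "prediction"])
counts = toy.groupBy("prediction").agg(countDistinct("customer").alias("count"))
print(counts.select(_stddev(col("count")).alias("std")).first()["std"])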
Example #2
def get_mean_and_std(all_harvest_df):
    # https://stackoverflow.com/a/47995478
    df_stats = all_harvest_df.select(
        _mean(col('yield')).alias('mean'),
        _stddev(col('yield')).alias('std')).collect()
    mean = df_stats[0]['mean']
    std = df_stats[0]['std']
    return mean, std
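# A small usage sketch (toy data, not from the original) showing the typical
# follow-up: z-scoring the 'yield' column with the returned statistics.
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

spark = SparkSession.builder.getOrCreate()
harvest = spark.createDataFrame([(10.0,), (12.0,), (15.0,)], ["yield"])
mean_y, std_y = get_mean_and_std(harvest)
harvest.withColumn("yield_z", (col("yield") - mean_y) / std_y).show()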
def mean_stdv(df):
    # MinMax-scale the 'count' column to [0, 1], then return its mean and standard
    # deviation. The loop has a single element, so the function returns after the
    # first (and only) iteration.
    unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())
    for i in ["count"]:
        assembler = VectorAssembler(inputCols=[i], outputCol=i + "_Vect")
        scaler = MinMaxScaler(inputCol=i + "_Vect", outputCol=i + "_Scaled")
        pipeline = Pipeline(stages=[assembler, scaler])
        df = pipeline.fit(df).transform(df) \
            .withColumn(i + "_Scaled", unlist(i + "_Scaled")) \
            .drop(i + "_Vect")
        df_stats = df.select(
            _mean(col('count_Scaled')).alias('mean'),
            _stddev(col('count_Scaled')).alias('std')).collect()
        # mean = df_stats[0]['mean']
        # std = df_stats[0]['std']
        return df_stats
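# A minimal usage sketch (hypothetical data, not from the original): mean_stdv expects
# a DataFrame that already has a numeric 'count' column (e.g. from groupBy().count())
# and assumes the imports below are in scope where the function is defined.
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, mean as _mean, stddev as _stddev, col
from pyspark.sql.types import DoubleType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

spark = SparkSession.builder.getOrCreate()
purchases = spark.createDataFrame(
    [("a", 3.0), ("b", 7.0), ("c", 5.0)], ["customer", "count"])
print(mean_stdv(purchases))  # [Row(mean=..., std=...)]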
    async def df_sbx_customer_special_box_purchased(self, sbx, spark):
        data = await sbx.with_model('cart_box') \
            .set_page_size(1000) \
            .and_where_is_not_null('purchase') \
            .and_where_is_equal('variety', os.environ['SPECIAL_BOX']).find()
        sc = spark.sparkContext

        def deleteMeta(d):
            dt = {}
            dt['customer'] = d['customer']
            dt['total_items'] = d['total_items']
            dt['current_percentage'] = d['current_percentage']
            dt['count'] = 1
            return dt

        dit = list(map(deleteMeta, data['results']))
        tmp = sc.parallelize(dit, numSlices=100)
        df = spark.read.option("multiLine", "true").json(tmp)
        df2 = df.groupBy("customer").agg(
            func.avg("total_items").alias('total_items'),
            func.avg("current_percentage").alias('current_percentage'),
            func.sum("count").alias('count'))

        (cumean, custd, comean, costd, tmean, tstd) = df2.select(
            _mean(col('current_percentage')).alias('cumean'),
            _stddev(col('current_percentage')).alias('custd'),
            _mean(col('count')).alias('comean'),
            _stddev(col('count')).alias('costd'),
            _mean(col('total_items')).alias('tmean'),
            _stddev(col('total_items')).alias('tstd'),
        ).first()
        df3 = df2.withColumn(
            "acurrent_percentage",
            (col("current_percentage") - cumean) / custd).withColumn(
                "acount", (col("count") - comean) / costd).withColumn(
                    "atotal_items", (col("total_items") - tmean) / tstd)
        vecAssembler = VectorAssembler(
            inputCols=["acurrent_percentage", "acount", "atotal_items"],
            outputCol="features")
        return vecAssembler.transform(df3)
Example #5
def incomeZScore():
    df_stats = dataset.select(
        _mean(col('Average_Income')).alias('mean'),
        _stddev(col('Average_Income')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    # alias the derived column directly instead of renaming the auto-generated column name
    df1 = dataset.select(
        ((dataset['Average_Income'] - mean) / std).alias("z_score_AvgInc"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = dataset.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
Example #6
def AverageAndStd(time, rdd, streaming_dict, id):
    if rdd.isEmpty():
        return
    df = rdd.map(lambda x: Row(**x)).toDF()
    columns = df.schema.names
    conditions_mean = [
        _mean(col(column)).alias(column + "_mean") for column in columns
    ]
    conditions_std = [
        _stddev(col(column)).alias(column + "_stddev") for column in columns
    ]

    df = df.select(conditions_mean + conditions_std).toPandas()
    df["time_stamp"] = time.timestamp() * 1000

    if id in streaming_dict:
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        # (this assumes `import pandas as pd` at module level).
        streaming_dict[id] = pd.concat([streaming_dict[id], df], ignore_index=True)
    else:
        streaming_dict[id] = df
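# A wiring sketch (an assumption, not shown in the original): foreachRDD hands the
# handler (time, rdd), so the remaining arguments are bound with a lambda. This uses
# the legacy DStream API, which is deprecated in recent Spark releases.
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext

spark = SparkSession.builder.getOrCreate()  # needed for rdd.toDF() inside the handler
ssc = StreamingContext(spark.sparkContext, batchDuration=5)
streaming_dict = {}
# One queued micro-batch of dict records stands in for a real input stream.
batch = spark.sparkContext.parallelize([{"temp": 20.0, "rpm": 900.0},
                                        {"temp": 22.0, "rpm": 950.0}])
ssc.queueStream([batch]).foreachRDD(
    lambda time, rdd: AverageAndStd(time, rdd, streaming_dict, "machine-1"))
# ssc.start(); ssc.awaitTermination()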
Example #7
def crimeZScore():
    df2 = unemploymentZScore()
    df_stats = dataset.select(
        _mean(col('Crime_Percent')).alias('mean'),
        _stddev(col('Crime_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        ((dataset['Crime_Percent'] - mean) / std).alias("z_score_Crime"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #8
def unemploymentZScore():
    df2 = incomeZScore()
    df_stats = dataset.select(
        _mean(col('Unemployment_Percent')).alias('mean'),
        _stddev(col('Unemployment_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        ((dataset['Unemployment_Percent'] - mean) / std).alias("z_score_Unem"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #9
def homelessZScore():
    df2 = crimeZScore()
    df_stats = dataset.select(
        _mean(col('Homeless_Percent')).alias('mean'),
        _stddev(col('Homeless_Percent')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = dataset.select(
        ((dataset['Homeless_Percent'] - mean) / std).alias("z_score_Homeless"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df2.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    final_df.show()
    return final_df
Example #10
def prepareData():
    df_newOpiFac = newOpioidFactor()
    df_newOpiFac.show()
    df_AvgInc = homelessZScore()

    df_stats = df_newOpiFac.select(
        _mean(col('new_opioid_factor')).alias('mean'),
        _stddev(col('new_opioid_factor')).alias('std')).collect()

    mean = df_stats[0]['mean']
    std = df_stats[0]['std']

    df1 = df_newOpiFac.select(
        ((df_newOpiFac['new_opioid_factor'] - mean) / std).alias("z_score_opioid"))
    df11 = df1.withColumn("columnindex", monotonically_increasing_id())
    df22 = df_AvgInc.withColumn("columnindex", monotonically_increasing_id())
    final_df = df22.join(df11, df22.columnindex == df11.columnindex,
                         'inner').drop(df11.columnindex).drop(df22.columnindex)
    return final_df
Example #11
File: S1.py  Project: Enter26/spark-python
    df.show(5)
    # modify the data structure and display the modified schema
    df2 = df.withColumn("max_temp", df["max_temp"].cast(FloatType()))
    # capture the time at which data loading finished
    end_load_time = datetime.datetime.now()
    # run the task
    df2.filter((df2['latitude'] >= 41) & (df2['latitude'] <= 45)
               & (df2['longitude'] >= (-110)) & (df2['longitude'] <= (-104))
               & (df2['year'] >= 1970)).groupBy("year").mean('max_temp').sort(
                   'year').show(50)
    # standard deviation
    df2.filter((df2['latitude'] >= 41) & (df2['latitude'] <= 45)
               & (df2['longitude'] >= (-110)) & (df2['longitude'] <= (-104))
               & (df2['year'] >= 1970)).groupBy("year").mean(
                   'max_temp').sort('year').select(
                       _stddev(col('avg(max_temp)')).alias('std')).show(1)

    # capture the time at which the program/task finished
    end_time = datetime.datetime.now()
    sc.stop()
    # build the result variables
    time_load_data = end_load_time - start_time
    time_of_execution = end_time - end_load_time
    total_time = end_time - start_time
    # print the results
    print("Time load data:")
    print(time_load_data)
    print("Time of execution:")
    print(time_of_execution)
    print("Total time:")
    print(total_time)
wikiCategoryFile = "gs://metcs777/wiki-categorylinks.csv.bz2"
wikiCategoryLinks = sc.textFile(wikiCategoryFile)
wikiCats = wikiCategoryLinks.map(lambda x: x.split(",")).map(
    lambda x: (x[0].replace('"', ''), x[1].replace('"', '')))
df = sqlContext.createDataFrame(wikiCats)

df.show()
"""
task3.1
"""
from pyspark.sql import functions as func
from pyspark.sql import DataFrameStatFunctions as statFunc
from pyspark.sql import functions as F
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col
df_cate = df.groupBy(df[1]).count()
max = df_cate.agg(func.max("count")).show()
avg = df_cate.agg(func.mean("count")).show()
med = F.expr('percentile_approx(count, 0.5)')
median = df_cate.agg(med.alias('med(count)')).show()
std = df_cate.select(_stddev(col('count')).alias('std')).show()
"""
task3.2
"""
top = df_cate.orderBy("count", ascending=[0]).show(10)
"""
task3.3
"""
top_cate = df_cate.orderBy("count", ascending=[0]).limit(10)
top_page = top_cate.join(df, df[1] == top_cate[0]).drop("count")
top_id = top_page.groupBy(top_page[1]).count()
print(top_id.select(top_id[0]).show())
Example #13
bounded_df.filter(bounded_df.INDUS_outliers != 0).select(
    'INDUS_outliers', 'INDUS').show()
bounded_df.filter(bounded_df.CHAS_outliers != 0).select(
    'CHAS_outliers', 'CHAS').show()
bounded_df.filter(bounded_df.NOX_outliers != 0).select('NOX_outliers',
                                                       'NOX').show()
bounded_df.filter(bounded_df.RM_outliers != 0).select('RM_outliers',
                                                      'RM').show()
'''
Spark does not provide built-in visualization;
the Databricks display method is used here to visualize the content.
'''
display(
    bounded_df.select('CRIM_outliers', 'ZN_outliers', 'INDUS_outliers',
                      'CHAS_outliers', 'NOX_outliers', 'RM_outliers',
                      'AGE_outliers', 'DIS_outliers', 'RAD_outliers',
                      'TAX_outliers', 'PTRATIO_outliers', 'B_outliers',
                      'LSTAT_outliers', 'PRICE_outliers'))

# Calculating Z-score
val = df.select(df.INDUS.cast("int"))
df_stats = df.select(
    _mean(df.INDUS.cast("double")).alias('mean'),
    _stddev(df.INDUS.cast("double")).alias('std'),
).collect()
#Add score_INDUS in dataframe
mean = df_stats[0]['mean']
std = df_stats[0]['std']
score_INDUS = df.withColumn("z score_INDUS",
                            (df.INDUS.cast("double") - mean) / std)
score_INDUS.show()
# Standardization: transform a feature so that mean = 0 and
# standard deviation = 1 for the transformed array of values
def standardize(x, meanVal, stdVal):
    return (x - meanVal) / stdVal


def standardize_udf(meanVal, stdVal):
    return udf(lambda x: standardize(x, meanVal, stdVal), FloatType())


# extract mean and standard deviation value for the column 'height_percentage'
df_stats_hp = df.select(
    _mean(col('height_percentage')).alias('mean'),
    _stddev(col('height_percentage')).alias('std')).collect()

mean_hp = df_stats_hp[0]['mean']
std_hp = df_stats_hp[0]['std']

# extract mean and standard deviation value for the column 'age'
df_stats_age = df.select(
    _mean(col('age')).alias('mean'),
    _stddev(col('age')).alias('std')).collect()

mean_age = df_stats_age[0]['mean']
std_age = df_stats_age[0]['std']

# perform simple standardization on the 'age' and 'height_percentage' column
df = df.withColumn('height_percentage',
                   standardize_udf(mean_hp, std_hp)(col('height_percentage')))
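# The matching transform for the 'age' column is implied by the comment above but was
# cut off in the original snippet; this completion is an assumption following the same pattern.
df = df.withColumn('age', standardize_udf(mean_age, std_age)(col('age')))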
Example #15
# 3. DATA PREPROCESSING
# drop Name, PassengerID and Ticket (identifiers, not attributes)
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived', 'Cabin')

# Dealing with missing values
# CABIN: drop Cabin; 77.1% of the values are null.
df = df.select('Age', 'Fare', 'Sex', 'Pclass', 'Embarked', 'SibSp', 'Parch',
               'Survived')

# AGE: Substitute by Mean
from pyspark.sql.functions import mean as _mean, stddev as _stddev, col

df_stats = df.select(
    _mean(col('Age')).alias('mean'),
    _stddev(col('Age')).alias('std')).collect()
mean = df_stats[0]['mean']
std = df_stats[0]['std']
# mean = 29.7 ≈ 30
df = df.fillna(30, subset=['Age'])

# FARE: Substitute by Mean
df_stats = df.select(_mean(col('Fare')).alias('mean')).collect()
mean = df_stats[0]['mean']
# mean=33.2955
df = df.fillna(33.2955, subset=['Fare'])

# EMBARKED: Substitute by most common class
df.groupBy(df['Embarked']).count().show()
# The most common class is 'S'
df = df.fillna('S', subset=['Embarked'])
Example #16
    def drop_duplicate_cols(self):
        '''
        Needs optimization
        '''

        numerical_col = [
            i_val for (i_val, i_type) in self._data_frame.dtypes
            if i_type == 'int' or i_type == 'double' or i_type == 'float'
        ]
        categorical_col = [
            i_val for (i_val, i_type) in self._data_frame.dtypes
            if i_type == 'string'
        ]
        boolean_col = [
            i_val for (i_val, i_type) in self._data_frame.dtypes
            if i_type == 'boolean'
        ]

        categorical_list1 = []
        categorical_list2 = []
        numerical_list1 = []
        numerical_list2 = []
        boolean_list1 = []
        boolean_list2 = []
        remove_list = []

        for i in range(len(categorical_col) - 1):
            for j in range(i + 1, len(categorical_col)):
                if self._data_frame.groupby(
                        self._data_frame[categorical_col[i]]).count().collect(
                        ) == self._data_frame.groupby(
                            categorical_col[j]).count().collect():
                    if categorical_col[
                            j] == self._dataframe_context.get_result_column():
                        categorical_list1.append(categorical_col[i])
                        categorical_list2.append(categorical_col[j])
                    else:
                        categorical_list1.append(categorical_col[j])
                        categorical_list2.append(categorical_col[i])

        count_dict1 = dict(list(zip(categorical_list1, categorical_list2)))

        elements_list1 = []
        elements_list2 = []

        for k, v in list(count_dict1.items()):
            elements_list1 = self._data_frame.select(k)
            elements_list2 = self._data_frame.select(v)
            if elements_list1.collect() == elements_list2.collect():
                remove_list.append(k)

        for i in range(len(numerical_col) - 1):
            for j in range(i + 1, len(numerical_col)):
                df_col1_std = self._data_frame.select(
                    _stddev(col(
                        numerical_col[i])).alias('std')).collect()[0][0]
                df_col2_std = self._data_frame.select(
                    _stddev(col(
                        numerical_col[j])).alias('std')).collect()[0][0]
                if (df_col1_std == df_col2_std):
                    if numerical_col[
                            j] == self._dataframe_context.get_result_column():
                        numerical_list1.append(numerical_col[i])
                        numerical_list2.append(numerical_col[j])
                    else:
                        numerical_list1.append(numerical_col[j])
                        numerical_list2.append(numerical_col[i])

        count_dict2 = dict(list(zip(numerical_list1, numerical_list2)))

        elements_list3 = []
        elements_list4 = []

        for k, v in list(count_dict2.items()):
            elements_list3 = self._data_frame.select(k)
            elements_list4 = self._data_frame.select(v)
            if elements_list3.collect() == elements_list4.collect():
                remove_list.append(k)

        for i in range(len(boolean_col) - 1):
            for j in range(i + 1, len(boolean_col)):
                if self._data_frame.groupby(
                        self._data_frame[boolean_col[i]]).count().collect(
                        ) == self._data_frame.groupby(
                            boolean_col[j]).count().collect():
                    if boolean_col[
                            j] == self._dataframe_context.get_result_column():
                        boolean_list1.append(boolean_col[i])
                        boolean_list2.append(boolean_col[j])
                    else:
                        boolean_list1.append(boolean_col[j])
                        boolean_list2.append(boolean_col[i])

        count_dict3 = dict(list(zip(boolean_list1, boolean_list2)))

        elements_list5 = []
        elements_list6 = []

        for k, v in list(count_dict3.items()):
            elements_list5 = self._data_frame.select(k)
            elements_list6 = self._data_frame.select(v)
            if elements_list5.collect() == elements_list6.collect():
                remove_list.append(k)

        # print(remove_list)
        # return remove_list

        self.removed_col = remove_list
        self._data_frame = self._data_frame.drop(*remove_list)
        return self._data_frame