def main():
    """
    Build a one-row-per-metric summary of the imputed data: number of
    customers, number of rows, and number of customers with a positive
    outcome. A hedged sketch of the positive-outcome helper follows this
    function.
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_party_ids = imputed_df.select("party_id").distinct().count()

    num_rows = imputed_df.count()

    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)

    result = spark.createDataFrame(
        [['Number of customers', num_party_ids],
         ['Number of rows', num_rows],
         ['Number of customers with a positive outcome',
          num_party_ids_with_positive_outcome]],
        ['Metric', 'Value'])
    return result
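
# Hedged sketch of the helper used above. get_num_party_ids_with_positive_outcome
# is defined elsewhere; the version below is only illustrative and assumes the
# imputed dataframe carries a binary "outcome" column (hypothetical name),
# counting party_ids with at least one positive value.
def get_num_party_ids_with_positive_outcome_sketch(imputed_df):
    """Count distinct party_ids with at least one positive outcome (sketch)."""
    return (imputed_df
            .filter(imputed_df["outcome"] == 1)  # assumed column name
            .select("party_id")
            .distinct()
            .count())
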
def get_test_df():
    """
    Create a small test data frame with party_id, ctu and imputed_ctu
    columns for exercising the missing-CTU calculations below.
    """
    spark = start_spark_session()

    return spark.createDataFrame([
        (1, 1, 0),
        (1, 2, 0),
        (1, 3, 0),
        (1, 4, 1),
        (1, 5, 0),
        (2, 1, 0),
        (2, 2, 0),
        (2, 3, 1),
        (3, 1, 1),
        (3, 2, 1),
        (3, 3, 1),
        (4, 1, 1),
        (4, 2, 1),
        (4, 3, 1),
        (4, 4, 0),
        (5, 1, 1),
        (5, 2, 1),
        (5, 3, 1),
        (5, 4, 0),
        (5, 5, 0),
        (6, 1, 1),
        (6, 2, 1),
        (6, 3, 1),
        (6, 4, 0),
        (6, 5, 0),
        (7, 1, 0),
        (7, 2, 0),
        (7, 3, 0),
        (7, 4, 0),
        (7, 5, 0),
    ], ['party_id', 'ctu', 'imputed_ctu'])
def main():
    """
    1. For every party_id, calculate the number of imputed CTUs.
    2. Calculate the number of distinct CTUs per party_id.
    3. Divide the number of imputed CTUs by the number of distinct CTUs.
    4. Create buckets (0%-24%, 25%-49%, 50%-74%, 75%-98%, 99%-100%) and
       assign every party_id to the bucket its percentage falls into.
    5. Calculate the proportion of party_ids in each bucket out of the
       total number of party_ids, i.e. the proportion of accounts that
       have more than:

           99% missing    0.20
           75% missing    0.40
           50% missing    0.60
           25% missing    0.70

    A hedged sketch of steps 3-4 follows this function.
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df)
    num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid(
        imputed_df)
    joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,
                                               num_imputed_ctus_per_partyid)
    percentage_of_missing_ctus_per_partyid = \
        get_percentage_of_missing_ctus_per_party_id(
            joined_num_distinct_imputed_ctus_df)

    party_id_and_its_bucket = create_buckets(
        percentage_of_missing_ctus_per_partyid)
    num_partyids_with_missing_ctus_per_bucket = get_num_partyids_per_backet(
        party_id_and_its_bucket)
    total_num_ids = imputed_df.select("party_id").distinct().count()
    result_df = calculate_proportion_of_missing_ctus_per_percentile(
        spark, num_partyids_with_missing_ctus_per_bucket, total_num_ids)
    write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
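
# Hedged sketch of steps 3-4 from the docstring above. The project's real
# helpers (get_percentage_of_missing_ctus_per_party_id, create_buckets) live
# in helper_functions; this illustrative version assumes the joined dataframe
# has one row per party_id with "num_imputed_ctus" and "num_distinct_ctus"
# columns (assumed names).
from pyspark.sql import functions as F

def percentage_and_buckets_sketch(joined_num_distinct_imputed_ctus_df):
    """Compute the % of imputed CTUs per party_id and assign a bucket (sketch)."""
    pct_df = joined_num_distinct_imputed_ctus_df.withColumn(
        "pct_missing",
        F.col("num_imputed_ctus") / F.col("num_distinct_ctus") * 100)
    # Bucket boundaries follow the docstring: 0-24, 25-49, 50-74, 75-98, 99-100.
    return pct_df.withColumn(
        "bucket",
        F.when(F.col("pct_missing") >= 99, "99%-100%")
         .when(F.col("pct_missing") >= 75, "75%-98%")
         .when(F.col("pct_missing") >= 50, "50%-74%")
         .when(F.col("pct_missing") >= 25, "25%-49%")
         .otherwise("0%-24%"))
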
"""
1. Load the imputed train and predict dataframes
2. Get descriptive statistics for every column of each dataframe
3. Join the train and predict descriptive statistics on column name
4. Compute the difference (delta) between columns: get_delta_columns_df(joined_df)
5. Calculate the KS statistic and the KL divergence
6. Run the KS test
7. Join delta_df, ks_test and kl_divergence
   (a hedged sketch of steps 4 and 6 follows the code below)
"""

import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from col_stats import *
import config as cfg
from helper_functions import *
from scipy import stats


spark = start_spark_session()

# Step 1: load the imputed train and predict dataframes
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)

#imputed_train = test_df
#imputed_predict = test_df

# Step 2: get descriptive statistics for every column
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_predict)

# Step 3: join the train and predict descriptive statistics on column name
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats, 'column_name')
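
# Hedged sketch of the remaining docstring steps (deltas and a two-sample KS
# test). Column suffixes "_train" / "_predict", the stat_cols parameter, and
# running scipy's ks_2samp on collected column values are assumptions here,
# not the project's actual helper implementations; KL divergence is omitted.
def get_delta_columns_sketch(joined_df, stat_cols):
    """Add a <stat>_delta column (train minus predict) for each statistic (sketch)."""
    for stat in stat_cols:
        joined_df = joined_df.withColumn(
            stat + "_delta",
            joined_df[stat + "_train"] - joined_df[stat + "_predict"])
    return joined_df

def ks_test_per_column_sketch(train_df, predict_df, columns):
    """Run scipy's two-sample KS test on each shared numeric column (sketch)."""
    results = []
    for col in columns:
        train_vals = [r[0] for r in train_df.select(col).dropna().collect()]
        pred_vals = [r[0] for r in predict_df.select(col).dropna().collect()]
        statistic, p_value = stats.ks_2samp(train_vals, pred_vals)
        results.append((col, float(statistic), float(p_value)))
    return spark.createDataFrame(results, ['column_name', 'ks_statistic', 'p_value'])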