def main():
    """
    Build a one-row-per-metric summary of the imputed data: number of
    customers, number of rows, and number of customers with a positive
    outcome. A hedged sketch of the positive-outcome helper follows this
    function.
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_party_ids = imputed_df.select("party_id").distinct().count()

    num_rows = imputed_df.count()

    num_party_ids_with_positive_outcome = get_num_party_ids_with_positive_outcome(
        imputed_df)

    result = spark.createDataFrame(
        [['Number of customers', num_party_ids],
         ['Number of rows', num_rows],
         ['Number of customers with a positive outcome',
          num_party_ids_with_positive_outcome]],
        ['Metric', 'Value'])
    return result
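
# Hedged sketch of the helper used above. get_num_party_ids_with_positive_outcome
# is defined elsewhere; the version below is only illustrative and assumes the
# imputed dataframe carries a binary "outcome" column (hypothetical name),
# counting party_ids with at least one positive value.
def get_num_party_ids_with_positive_outcome_sketch(imputed_df):
    """Count distinct party_ids with at least one positive outcome (sketch)."""
    return (imputed_df
            .filter(imputed_df["outcome"] == 1)  # assumed column name
            .select("party_id")
            .distinct()
            .count())
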
def get_test_df():
    """
    Create a small test data frame with party_id, ctu and imputed_ctu
    columns for exercising the missing-CTU calculations below.
    """
    spark = start_spark_session()

    return spark.createDataFrame([
        (1, 1, 0),
        (1, 2, 0),
        (1, 3, 0),
        (1, 4, 1),
        (1, 5, 0),
        (2, 1, 0),
        (2, 2, 0),
        (2, 3, 1),
        (3, 1, 1),
        (3, 2, 1),
        (3, 3, 1),
        (4, 1, 1),
        (4, 2, 1),
        (4, 3, 1),
        (4, 4, 0),
        (5, 1, 1),
        (5, 2, 1),
        (5, 3, 1),
        (5, 4, 0),
        (5, 5, 0),
        (6, 1, 1),
        (6, 2, 1),
        (6, 3, 1),
        (6, 4, 0),
        (6, 5, 0),
        (7, 1, 0),
        (7, 2, 0),
        (7, 3, 0),
        (7, 4, 0),
        (7, 5, 0),
    ], ['party_id', 'ctu', 'imputed_ctu'])
def main():
    """
    1. For every party_id, calculate the number of imputed CTUs.
    2. Calculate the number of distinct CTUs per party_id.
    3. Divide the number of imputed CTUs by the number of distinct CTUs.
    4. Create buckets (0%-24%, 25%-49%, 50%-74%, 75%-98%, 99%-100%) and
       assign every party_id to the bucket its percentage falls into.
    5. Calculate the proportion of party_ids in each bucket out of the
       total number of party_ids, i.e. the proportion of accounts that
       have more than:

           99% missing    0.20
           75% missing    0.40
           50% missing    0.60
           25% missing    0.70

    A hedged sketch of steps 3-4 follows this function.
    """
    spark = start_spark_session()
    imputed_df = get_imputed_df(cfg.IMPUTATION_TRAIN_PATH,
                                cfg.IMPUTATION_PREDICT_PATH)

    num_imputed_ctus_per_partyid = get_num_imputed_ctus_per_partyid(imputed_df)
    num_distinct_ctus_per_partyid = get_num_distinct_ctus_per_partyid(
        imputed_df)
    joined_num_distinct_imputed_ctus_df = join(num_distinct_ctus_per_partyid,
                                               num_imputed_ctus_per_partyid)
    percentage_of_missing_ctus_per_partyid = \
        get_percentage_of_missing_ctus_per_party_id(
            joined_num_distinct_imputed_ctus_df)

    party_id_and_its_bucket = create_buckets(
        percentage_of_missing_ctus_per_partyid)
    num_partyids_with_missing_ctus_per_bucket = get_num_partyids_per_backet(
        party_id_and_its_bucket)
    total_num_ids = imputed_df.select("party_id").distinct().count()
    result_df = calculate_proportion_of_missing_ctus_per_percentile(
        spark, num_partyids_with_missing_ctus_per_bucket, total_num_ids)
    write_to_excel(result_df, "zone_5_ctu_imp_ste_6_miss_ctus")
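
# Hedged sketch of steps 3-4 from the docstring above. The project's real
# helpers (get_percentage_of_missing_ctus_per_party_id, create_buckets) live
# in helper_functions; this illustrative version assumes the joined dataframe
# has one row per party_id with "num_imputed_ctus" and "num_distinct_ctus"
# columns (assumed names).
from pyspark.sql import functions as F

def percentage_and_buckets_sketch(joined_num_distinct_imputed_ctus_df):
    """Compute the % of imputed CTUs per party_id and assign a bucket (sketch)."""
    pct_df = joined_num_distinct_imputed_ctus_df.withColumn(
        "pct_missing",
        F.col("num_imputed_ctus") / F.col("num_distinct_ctus") * 100)
    # Bucket boundaries follow the docstring: 0-24, 25-49, 50-74, 75-98, 99-100.
    return pct_df.withColumn(
        "bucket",
        F.when(F.col("pct_missing") >= 99, "99%-100%")
         .when(F.col("pct_missing") >= 75, "75%-98%")
         .when(F.col("pct_missing") >= 50, "50%-74%")
         .when(F.col("pct_missing") >= 25, "25%-49%")
         .otherwise("0%-24%"))
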
"""
1. Load the imputed train and predict dataframes
2. Get descriptive statistics for every column of each dataframe
3. Join the train and predict descriptive statistics on column name
4. Compute the difference (delta) between columns: get_delta_columns_df(joined_df)
5. Calculate the KS statistic and the KL divergence
6. Run the KS test
7. Join delta_df, ks_test and kl_divergence
   (a hedged sketch of steps 4 and 6 follows the code below)
"""

import sys
sys.path.append("/home/boldyrek/mysoft/te/te_reporting/")
from col_stats import *
import config as cfg
from helper_functions import *
from scipy import stats


spark = start_spark_session()

# Step 1: load the imputed train and predict dataframes
imputed_train = load_df(cfg.SPLIT_TRAIN_PATH)
imputed_predict = load_df(cfg.SPLIT_PRED_PATH)

#imputed_train = test_df
#imputed_predict = test_df

# Step 2: get descriptive statistics for every column
imputed_train_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_train)
imputed_predict_descriptive_stats = get_df_with_descriptive_stats_for_columns(spark, imputed_predict)

# Step 3: join the train and predict descriptive statistics on column name
joined_descriptive_stats = suffix_and_join_dfs(
    imputed_train_descriptive_stats, imputed_predict_descriptive_stats, 'column_name')
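
# Hedged sketch of the remaining docstring steps (deltas and a two-sample KS
# test). Column suffixes "_train" / "_predict", the stat_cols parameter, and
# running scipy's ks_2samp on collected column values are assumptions here,
# not the project's actual helper implementations; KL divergence is omitted.
def get_delta_columns_sketch(joined_df, stat_cols):
    """Add a <stat>_delta column (train minus predict) for each statistic (sketch)."""
    for stat in stat_cols:
        joined_df = joined_df.withColumn(
            stat + "_delta",
            joined_df[stat + "_train"] - joined_df[stat + "_predict"])
    return joined_df

def ks_test_per_column_sketch(train_df, predict_df, columns):
    """Run scipy's two-sample KS test on each shared numeric column (sketch)."""
    results = []
    for col in columns:
        train_vals = [r[0] for r in train_df.select(col).dropna().collect()]
        pred_vals = [r[0] for r in predict_df.select(col).dropna().collect()]
        statistic, p_value = stats.ks_2samp(train_vals, pred_vals)
        results.append((col, float(statistic), float(p_value)))
    return spark.createDataFrame(results, ['column_name', 'ks_statistic', 'p_value'])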