# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()
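# SparkRead / SparkWrite wrap the project's parquet reader and writer; the `spark`
# session passed to SparkRead is assumed to be provided by the config.env star
# import above.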

milestone_features = spark_read.parquet(
    path=data_dir.make_feature_path('milestone')
)
print(f'rows in milestone features {milestone_features.count()}')
comments_features = spark_read.parquet(
    path=data_dir.make_feature_path('comments')
)
print(f'rows in comments features {comments_features.count()}')

case_status_history_features = spark_read.parquet(
    path=data_dir.make_feature_path('case_status_history_features')
)
print(f'rows in case_status_history features {case_status_history_features.count()}')

metadata_features = spark_read.parquet(
    path=data_dir.make_feature_path('metadata')
)
# Example 2
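# first_last_history_features and base_table_case_status_history_features are
# built in earlier steps of this example that are not included in this snippet.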
first_last_history_features.show()
first_last_history_features.count()

base_table_case_status_history_features.show()

case_status_history_features = (
    first_last_history_features
    # .join(
    #     aggregated_history_features,
    #     on=['reference_id'],
    #     how='inner'
    # )
)

case_status_history_features.filter(F.col('reference_id') == 100052).show()

spark_write.parquet(
    df=case_status_history_features,
    path=data_dir.make_feature_path('case_status_history_features'),
    n_partitions=10
)
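# The features are persisted to parquet and re-read below so the row and column
# checks run against the materialised copy.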

case_status_history_features = spark_read.parquet(
    path=data_dir.make_feature_path('case_status_history_features'))

case_status_history_features.show()
print(
    f'number of rows in case status history features {case_status_history_features.count()}'
)  # 52968
print(f'number of columns in case status history features {len(case_status_history_features.columns)}')
# Example 3
# response_encoded_metadata_features = encode_categorical_using_mean_response_rate_inplace(
#     df=response_encoded_metadata_features,
#     categorical_column="first_label_encoded_comment_type",
#     response_column='response'
# )

### Mean encoding done
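
# `encode_categorical_using_mean_response_rate_inplace` is not defined in this
# snippet. A minimal sketch of what such a mean-response-rate (target) encoder
# could look like in PySpark follows; the function name mirrors the commented-out
# call above, while the output column name `mean_encoded_<column>` is an
# assumption. Despite "inplace" in the name, Spark DataFrames are immutable, so
# the result has to be reassigned, as the commented-out call does.
def encode_categorical_using_mean_response_rate_inplace(df, categorical_column,
                                                        response_column):
    # Average response per category level, joined back as a new encoded column.
    encoding = (
        df.groupBy(categorical_column)
        .agg(F.avg(response_column).alias(f'mean_encoded_{categorical_column}'))
    )
    return df.join(encoding, on=[categorical_column], how='left')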

comments_features = (
    base_table_case_status_history_features
    .join(
        response_encoded_metadata_features.select(
            'reference_id', 'seconds_since_case_start', *mean_encoded_columns
        ),
        on=['reference_id', 'seconds_since_case_start'],
        how='inner'
    )
    .select('reference_id', *non_distinct_columns)
    .drop_duplicates()
)

spark_write.parquet(df=comments_features,
                    path=data_dir.make_feature_path('comments'),
                    n_partitions=10)

comments_features = spark_read.parquet(
    path=data_dir.make_feature_path('comments'))
comments_features.show()
print(f'rows in comments features {comments_features.count()}')  # 52967
print(f'columns in comments features {len(comments_features.columns)}')

# next_to_last_comment_type = as.character(first(last(COMMENT_TYPE, 2))),
# next_to_last_comment_note_length = length(strsplit(first(last(NOTES, 2)), " ")[[1]]),
# mean_comment_note_length = mean(sapply(strsplit(NOTES, " "), length)),
# unique_terms_in_comment_note = uniqueN(unlist(strsplit(NOTES, " "))),
# unique_terms_in_last_comment_note = uniqueN(strsplit(last(NOTES), " ")[[1]]) ), by = REFERENCEID]
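
# The R data.table expressions above document comment-text features from the
# original pipeline. A rough PySpark equivalent for two of them, written as a
# sketch against a hypothetical DataFrame with the REFERENCEID and NOTES columns
# named in the R code (requires Spark 2.4+ for array_distinct/flatten):
def comment_note_text_features(comments_df):
    # Token counts per note, then per-case aggregates over all notes.
    tokenised = comments_df.withColumn('note_terms', F.split(F.col('NOTES'), ' '))
    return tokenised.groupBy('REFERENCEID').agg(
        F.avg(F.size('note_terms')).alias('mean_comment_note_length'),
        F.size(F.array_distinct(F.flatten(F.collect_list('note_terms'))))
        .alias('unique_terms_in_comment_note'),
    )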
metadata_features = (
    metadata
    .select(
        'reference_id',
        F.col('cloud').alias('is_cloud'),
        F.col('premium_code'),
        F.col('is_premium'),
        F.col('primary_product_family_id'),
        F.col('customer_phase')
    )
    .fillna(value=0, subset=['is_cloud'])
    .join(
        response_encoded_metadata_features.select(
            'reference_id', *mean_encoded_columns
        ),
        on=['reference_id'],
        how='inner'
    )
    .join(metadata_is_in_escalated, on=['reference_id'], how='inner')
    .join(metadata_label_encoded, on=['reference_id'], how='inner')
    .join(metadata_is_dummy_encoded, on=['reference_id'], how='inner')
)

spark_write.parquet(df=metadata_features,
                    path=data_dir.make_feature_path('metadata'),
                    n_partitions=10)

metadata_features = spark_read.parquet(
    path=data_dir.make_feature_path('metadata'))

metadata_features.show()
print(f" metadata has {metadata_features.count()} rows")
print(f" metadata has {len(metadata_features.columns)} columns")

metadata_features.groupby().agg(
    *[F.avg(col) for col in metadata_features.columns if 'dummy' in col]
).show()
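# For 0/1 dummy columns, the averages shown above are the fraction of cases with
# each flag set, which serves as a quick sanity check on the dummy encoding.

# The opening of the pivoted_milestone_ids_features expression is missing from
# this snippet; only its closing lines survive, ending with a filter that keeps
# rows where rank_seconds_to_milestone_features == 1 (presumably the closest
# milestone record per case).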
        )
    )
    .filter(
        F.col('rank_seconds_to_milestone_features') == 1
    )
)

pivoted_milestone_ids_features.show()

# pivoted_milestone_ids_features.filter(F.col('seconds_to_closest_milestone_features') != 0).show()
# pivoted_milestone_ids_features.sample(False, 0.001).show()

spark_write.parquet(
    df=pivoted_milestone_ids_features,
    path=data_dir.make_feature_path('pivoted_milestone_ids'),
    n_partitions=10
)

pivoted_milestone_ids_features = spark_read.parquet(
    path=data_dir.make_feature_path('pivoted_milestone_ids')
)

pivoted_milestone_ids_features.show()
print(f'pivoted_milestone_ids_features has {pivoted_milestone_ids_features.count()} rows')

#
#
# import lightgbm as lgb
# import numpy as np
# import pandas as pd
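
# Tail of the milestone_features expression; its opening is missing from this
# snippet. The surviving part keeps rows where
# reverse_order_row_number_seconds_since_case_start == 1 (presumably the latest
# record per case), joins the mean-encoded metadata features, and drops raw
# milestone text and label columns.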
    F.col('reverse_order_row_number_seconds_since_case_start') == 1
).join(
    response_encoded_metadata_features.select(
        'reference_id', 'seconds_since_case_start', *mean_encoded_columns
    ),
    on=['reference_id', 'seconds_since_case_start'],
    how='inner'
).drop(
    *milestone_columns_to_drop
).drop_duplicates().drop(
    'seconds_since_previous_milestone',
    'length_of_milestone_description',
    'is_milestone_note_6763',
    'note_description_null_for_6763',
    'length_of_milestone_note',
    'length_of_milestone_note_null_for_6763',
    'label_encoded_updated_by',
    'label_encoded_milestone_description',
    'last_milestone_description'
))

spark_write.parquet(df=milestone_features,
                    path=data_dir.make_feature_path('milestone'),
                    n_partitions=10)

milestone_features = spark_read.parquet(
    path=data_dir.make_feature_path('milestone'))
milestone_features.show()
print(f'rows in milestone features {milestone_features.count()}')  # ~52966 on repeated runs
print(f'columns in milestone features {len(milestone_features.columns)}')

# unique_terms_in_milestone_desc = uniqueN(unlist(strsplit(MILESTONEDESCRIPTION, " "))),
# mean_unique_terms_in_milestone_desc = mean(sapply(strsplit(MILESTONEDESCRIPTION, " "), uniqueN)),
# unique_terms_in_last_milestone_desc = uniqueN(strsplit(last(MILESTONEDESCRIPTION), " ")[[1]]),
#
# # next_to_last_milestone_note_length = length(strsplit(first(last(NOTEDESCRIPTION, 2)), " ")[[1]]),
# unique_terms_in_milestone_notes = uniqueN(unlist(strsplit(NOTEDESCRIPTION, " "))),
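
# The remaining R expressions mirror the NOTES-based features above, applied to
# MILESTONEDESCRIPTION / NOTEDESCRIPTION. As a sketch against a hypothetical
# DataFrame with those (uppercase, R-era) column names, the per-description
# distinct-term count follows the same split / array_distinct pattern:
def milestone_description_text_features(milestones_df):
    # Distinct terms per description, averaged per case.
    tokenised = milestones_df.withColumn(
        'desc_terms', F.array_distinct(F.split(F.col('MILESTONEDESCRIPTION'), ' '))
    )
    return tokenised.groupBy('REFERENCEID').agg(
        F.avg(F.size('desc_terms')).alias('mean_unique_terms_in_milestone_desc')
    )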