# extract_features_history <- function(dt, ref_ids_escalated)  # legacy R signature, kept for reference

from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

# Reload the persisted feature tables and sanity-check their row counts.
milestone_features = spark_read.parquet(
    path=data_dir.make_feature_path('milestone')
)
print(f'rows in milestone features {milestone_features.count()}')

comments_features = spark_read.parquet(
    path=data_dir.make_feature_path('comments')
)
print(f'rows in comments features {comments_features.count()}')

case_status_history_features = spark_read.parquet(
    path=data_dir.make_feature_path('case_status_history_features')
)
print(f'rows in case_status_history features {case_status_history_features.count()}')

metadata_features = spark_read.parquet(
    path=data_dir.make_feature_path('metadata')
)
first_last_history_features.show()
first_last_history_features.count()
base_table_case_status_history_features.show()

# The aggregated-history join is currently disabled, so the case status history
# features are taken from the first/last history table as-is.
case_status_history_features = (
    first_last_history_features
    # .join(
    #     aggregated_history_features,
    #     on=['reference_id'],
    #     how='inner'
    # )
)

# Spot-check a single case before persisting.
case_status_history_features.filter(F.col('reference_id') == 100052).show()

spark_write.parquet(
    df=case_status_history_features,
    path=data_dir.make_feature_path('case_status_history_features'),
    n_partitions=10
)

case_status_history_features = spark_read.parquet(
    path=data_dir.make_feature_path('case_status_history_features')
)
case_status_history_features.show()
print(
    f'number of rows in case status history features {case_status_history_features.count()}'
)  # 52968
print(len(case_status_history_features.columns))
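# Hedged sketch of how first/last status features are commonly derived with
# window functions; `first_last_history_features` is actually built upstream of
# this file, so the input DataFrame and its status/status_date columns are
# illustrative assumptions only.
from pyspark.sql import Window

def sketch_first_last_status(history_df):
    w_asc = Window.partitionBy('reference_id').orderBy(F.col('status_date').asc())
    w_desc = Window.partitionBy('reference_id').orderBy(F.col('status_date').desc())
    return (
        history_df
        .withColumn('first_status', F.first('status').over(w_asc))
        .withColumn('last_status', F.first('status').over(w_desc))
        .select('reference_id', 'first_status', 'last_status')
        .drop_duplicates()
    )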
# response_encoded_metadata_features = encode_categorical_using_mean_response_rate_inplace(
#     df=response_encoded_metadata_features,
#     categorical_column="first_label_encoded_comment_type",
#     response_column='response'
# )
### Mean encoding done

# Attach the mean-encoded comment columns to the base table and keep one row
# per reference.
comments_features = (
    base_table_case_status_history_features
    .join(
        response_encoded_metadata_features.select(
            *['reference_id', 'seconds_since_case_start'] + mean_encoded_columns
        ),
        on=['reference_id', 'seconds_since_case_start'],
        how='inner'
    )
    .select(*['reference_id'] + non_distinct_columns)
    .drop_duplicates()
)

spark_write.parquet(
    df=comments_features,
    path=data_dir.make_feature_path('comments'),
    n_partitions=10
)

comments_features = spark_read.parquet(
    path=data_dir.make_feature_path('comments')
)
comments_features.show()
print(f'rows in comments features {comments_features.count()}')  # 52967
print(f'columns in comments features {len(comments_features.columns)}')

# Remaining comment-note features from the original R (data.table) version,
# kept here for reference (see the hedged sketch below):
# next_to_last_comment_type = as.character(first(last(COMMENT_TYPE, 2))),
# next_to_last_comment_note_length = length(strsplit(first(last(NOTES, 2)), " ")[[1]]),
# mean_comment_note_length = mean(sapply(strsplit(NOTES, " "), length)),
# unique_terms_in_comment_note = uniqueN(unlist(strsplit(NOTES, " "))),
# unique_terms_in_last_comment_note = uniqueN(strsplit(last(NOTES), " ")[[1]])
# ), by = REFERENCEID]
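# Hedged sketch (not the project's implementation) of two of the legacy R note
# features listed above, written as a helper that would take a comments
# DataFrame with REFERENCEID and NOTES columns; the column names come from the
# R snippet, the function name is illustrative only.
def sketch_comment_note_features(comments_df):
    # Average whitespace-token count of each note, per reference.
    note_lengths = comments_df.groupBy('REFERENCEID').agg(
        F.avg(F.size(F.split(F.col('NOTES'), ' '))).alias('mean_comment_note_length')
    )
    # Distinct tokens across all of a reference's notes.
    unique_terms = (
        comments_df
        .select('REFERENCEID', F.explode(F.split(F.col('NOTES'), ' ')).alias('term'))
        .groupBy('REFERENCEID')
        .agg(F.countDistinct('term').alias('unique_terms_in_comment_note'))
    )
    return note_lengths.join(unique_terms, on='REFERENCEID', how='inner')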
# Combine the raw metadata columns with the mean-encoded, escalation-flag,
# label-encoded and dummy-encoded metadata tables.
metadata_features = (
    metadata
    .select(
        'reference_id',
        F.col('cloud').alias('is_cloud'),
        F.col('premium_code'),
        F.col('is_premium'),
        F.col('primary_product_family_id'),
        F.col('customer_phase')
    )
    .fillna(value=0, subset=['is_cloud'])
    .join(
        response_encoded_metadata_features.select(*['reference_id'] + mean_encoded_columns),
        on=['reference_id'],
        how='inner'
    )
    .join(metadata_is_in_escalated, on=['reference_id'], how='inner')
    .join(metadata_label_encoded, on=['reference_id'], how='inner')
    .join(metadata_is_dummy_encoded, on=['reference_id'], how='inner')
)

spark_write.parquet(
    df=metadata_features,
    path=data_dir.make_feature_path('metadata'),
    n_partitions=10
)

metadata_features = spark_read.parquet(
    path=data_dir.make_feature_path('metadata')
)
metadata_features.show()
print(f'metadata has {metadata_features.count()} rows')
print(f'metadata has {len(metadata_features.columns)} columns')

# Sanity check: the mean of each dummy column is the share of cases in that category.
metadata_features.groupby().agg(
    *[F.avg(col) for col in metadata_features.columns if 'dummy' in col]
).show()
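# Hedged sketch of the mean-response-rate (target) encoding that
# `mean_encoded_columns` appears to come from; the project's actual helper,
# `encode_categorical_using_mean_response_rate_inplace`, is defined elsewhere,
# so this stand-in is illustrative only and has no smoothing or out-of-fold logic.
def sketch_mean_response_rate_encode(df, categorical_column, response_column):
    # Average response per category level, joined back onto the input frame.
    rates = df.groupBy(categorical_column).agg(
        F.avg(F.col(response_column)).alias(f'mean_encoded_{categorical_column}')
    )
    return df.join(rates, on=categorical_column, how='left')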
        )
    )
    .filter(
        F.col('rank_seconds_to_milestone_features') == 1
    )
)

pivoted_milestone_ids_features.show()
# pivoted_milestone_ids_features.filter(F.col('seconds_to_closest_milestone_features') != 0).show()
# pivoted_milestone_ids_features.sample(False, 0.001).show()

spark_write.parquet(
    df=pivoted_milestone_ids_features,
    path=data_dir.make_feature_path('pivoted_milestone_ids'),
    n_partitions=10
)

pivoted_milestone_ids_features = spark_read.parquet(
    data_dir.make_feature_path('pivoted_milestone_ids')
)
pivoted_milestone_ids_features.show()
print(f'pivoted_milestone_ids_features has {pivoted_milestone_ids_features.count()} rows')

# import lightgbm as lgb
# import numpy as np
# import pandas as pd
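# The head of the `pivoted_milestone_ids_features` expression sits above this
# section; a hedged sketch of the "rank by proximity, keep the closest row"
# pattern its visible tail suggests. The input DataFrame and the
# seconds_to_milestone column are illustrative assumptions only.
from pyspark.sql import Window

def sketch_keep_closest_milestone(milestone_events_df):
    w = Window.partitionBy('reference_id').orderBy(F.col('seconds_to_milestone').asc())
    return (
        milestone_events_df
        .withColumn('rank_seconds_to_milestone_features', F.rank().over(w))
        .filter(F.col('rank_seconds_to_milestone_features') == 1)
    )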
    F.col('reverse_order_row_number_seconds_since_case_start') == 1)
    .join(
        response_encoded_metadata_features.select(
            *['reference_id', 'seconds_since_case_start'] + mean_encoded_columns
        ),
        on=['reference_id', 'seconds_since_case_start'],
        how='inner'
    )
    .drop(*milestone_columns_to_drop)
    .drop_duplicates()
    .drop(*[
        'seconds_since_previous_milestone',
        'length_of_milestone_description',
        'is_milestone_note_6763',
        'note_description_null_for_6763',
        'length_of_milestone_note',
        'length_of_milestone_note_null_for_6763',
        'label_encoded_updated_by',
        'label_encoded_milestone_description',
        'last_milestone_description'
    ])
)

spark_write.parquet(
    df=milestone_features,
    path=data_dir.make_feature_path('milestone'),
    n_partitions=10
)

milestone_features = spark_read.parquet(
    path=data_dir.make_feature_path('milestone')
)
milestone_features.show()
print(f'rows in milestone features {milestone_features.count()}')  # 52966 (approx) - again - 52966
print(f'columns in milestone features {len(milestone_features.columns)}')

# Remaining milestone text features from the original R (data.table) version,
# kept here for reference (see the hedged sketch below):
# unique_terms_in_milestone_desc = uniqueN(unlist(strsplit(MILESTONEDESCRIPTION, " "))),
# mean_unique_terms_in_milestone_desc = mean(sapply(strsplit(MILESTONEDESCRIPTION, " "), uniqueN)),
# unique_terms_in_last_milestone_desc = uniqueN(strsplit(last(MILESTONEDESCRIPTION), " ")[[1]]),
#
# next_to_last_milestone_note_length = length(strsplit(first(last(NOTEDESCRIPTION, 2)), " ")[[1]]),
# unique_terms_in_milestone_notes = uniqueN(unlist(strsplit(NOTEDESCRIPTION, " "))),
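# Hedged sketch (not the project's implementation) of two of the legacy R
# milestone-description features listed above; it assumes a DataFrame with
# REFERENCEID and MILESTONEDESCRIPTION columns, names taken from the R snippet.
def sketch_milestone_desc_features(milestones_df):
    # Unique tokens within each description, averaged per reference.
    per_desc = milestones_df.groupBy('REFERENCEID').agg(
        F.avg(
            F.size(F.array_distinct(F.split(F.col('MILESTONEDESCRIPTION'), ' ')))
        ).alias('mean_unique_terms_in_milestone_desc')
    )
    # Unique tokens across all of a reference's descriptions.
    all_terms = (
        milestones_df
        .select('REFERENCEID', F.explode(F.split(F.col('MILESTONEDESCRIPTION'), ' ')).alias('term'))
        .groupBy('REFERENCEID')
        .agg(F.countDistinct('term').alias('unique_terms_in_milestone_desc'))
    )
    return per_desc.join(all_terms, on='REFERENCEID', how='inner')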