# extract_features_history <- function(dt, ref_ids_escalated)
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import (
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

# Load the interim case status history and the test split.
case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))
test = spark_read.parquet(path=data_dir.make_interim_path('test'))

# Reference ids of cases that escalate at some point.
ref_ids_escalated = (case_status_history
                     .filter(F.col("inverse_time_to_next_escalation") > 0)
                     .select('reference_id')
                     .distinct())
ref_ids_escalated.count()

# History rows truncated at each case's cutoff time.
history_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('history_with_cutoff_times'))
history_with_cutoff_times.show()

base_table_case_status_history_features = (
    history_with_cutoff_times.withColumn(
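# The assignment above is cut off in this file. The sketch below only
# illustrates the kind of windowed history feature such a base table might
# hold; the window spec, the 'events_so_far' name, and the presence of
# 'seconds_since_case_start' on history_with_cutoff_times are assumptions,
# not the project's actual feature definition.
_history_window = (Window.partitionBy('reference_id')
                   .orderBy('seconds_since_case_start')
                   .rowsBetween(Window.unboundedPreceding, Window.currentRow))
_example_history_features = history_with_cutoff_times.withColumn(
    'events_so_far', F.count(F.lit(1)).over(_history_window))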
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import (
    one_hot_encode_categorical, label_encode_categorical_inplace,
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))
case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

# Reference ids of cases that escalate at some point.
ref_ids_escalated = (case_status_history
                     .filter(F.col("inverse_time_to_next_escalation") > 0)
                     .select('reference_id')
                     .distinct())
ref_ids_escalated.count()

# Comments truncated at each case's cutoff time.
comments_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('comments_with_cutoff_times'))
comments_with_cutoff_times.show()

# EDA: distribution of comment types.
comments_with_cutoff_times.groupby('comment_type').count().orderBy(
    F.desc('count')).show(n=100)
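# Rough sketch of the mean-response-rate (target) encoding idea behind the
# imported encode_categorical_using_mean_response_rate_inplace helper,
# written in plain PySpark. The helper's real signature is not shown here;
# the 'label' column and the 'reference_id' join key are assumptions.
_rates = (comments_with_cutoff_times
          .join(ref_ids_escalated.withColumn('label', F.lit(1.0)),
                on='reference_id', how='left')
          .fillna({'label': 0.0})
          .groupby('comment_type')
          .agg(F.avg('label').alias('comment_type_escalation_rate')))
_encoded_comments = comments_with_cutoff_times.join(
    _rates, on='comment_type', how='left')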
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

response_file = spark_read.parquet(data_dir.make_interim_path('response_file'))
milestones = spark_read.parquet(data_dir.make_interim_path('milestones'))

milestones.orderBy('reference_id', 'seconds_since_case_start').show()

# EDA milestones
# milestone_id distributions
milestones.create_distribution(
    groupby_columns=['milestone_id'],
    numeric_column='seconds_since_case_start',
    include_count_distribution=True
).orderBy(F.desc('count')).show(n=70)
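# create_distribution is a project-specific helper, not a stock PySpark
# DataFrame method. A rough plain-PySpark equivalent of the same EDA
# (per-milestone row count plus summary stats of seconds_since_case_start)
# might look like the sketch below; the exact stats it reports are assumed.
_milestone_summary = (milestones
                      .groupby('milestone_id')
                      .agg(F.count(F.lit(1)).alias('count'),
                           F.mean('seconds_since_case_start').alias('mean'),
                           F.expr('percentile_approx(seconds_since_case_start, 0.5)')
                            .alias('median'),
                           F.max('seconds_since_case_start').alias('max')))
_milestone_summary.orderBy(F.desc('count')).show(n=70)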
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))
test = spark_read.parquet(path=data_dir.make_interim_path('test')).withColumn(
    'inverse_time_to_next_escalation',
    F.col('inverse_time_to_next_escalation').cast('double'))
milestones = spark_read.parquet(path=data_dir.make_interim_path('milestones'))
comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))

case_status_history.show()

# First escalation time and last observed time per escalated case.
escalation_starts = (case_status_history
                     .filter(F.col('is_escalate') == 'Y')
                     .groupby('reference_id')
                     .agg(F.min('seconds_since_case_start').alias('escalation_start'),
                          F.max('seconds_since_case_start').alias('case_end')))
escalation_starts.count()  # 646
escalation_starts.filter(F.col('reference_id') == 100087).show()
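# How these escalation windows feed the *_with_cutoff_times tables is not
# shown in this script. One hedged sketch, assuming the cutoff simply keeps
# everything strictly before the first escalation, would be:
_history_before_escalation = (case_status_history
                              .join(escalation_starts,
                                    on='reference_id', how='inner')
                              .filter(F.col('seconds_since_case_start')
                                      < F.col('escalation_start')))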
# from config.spark_setup import launch_spark
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

response_file = spark_read.parquet(data_dir.make_interim_path('response_file'))
response_file.orderBy('reference_id', 'seconds_since_case_start').show()

# Milestone ids pivoted into one feature column per milestone.
pivoted_milestone_ids_features = spark_read.parquet(
    path=data_dir.make_feature_path('pivoted_milestone_ids'))
pivoted_milestone_ids_features.show()

print(f"rows in response file: {response_file.count()}")  # 720938
print(f"rows in pivoted milestone features: "
      f"{pivoted_milestone_ids_features.count()}")

model_file = (
    response_file
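# The model_file assembly above is truncated. A minimal sketch of the kind
# of join it presumably performs -- attaching the pivoted milestone features
# to the response file -- assuming 'reference_id' is the join key and that
# missing milestone counts should read as zero:
_example_model_file = (response_file
                       .join(pivoted_milestone_ids_features,
                             on='reference_id', how='left')
                       .fillna(0))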