Example #1
# extract_features_history <- function(dt, ref_ids_escalated)

from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import encode_categorical_using_mean_response_rate_inplace
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

test = spark_read.parquet(path=data_dir.make_interim_path('test'))

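# Reference IDs of cases that eventually escalate (rows with a positive inverse_time_to_next_escalation).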
ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

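# Status-history events with feature cutoff times attached (read from the processed data directory).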
history_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('history_with_cutoff_times'))

history_with_cutoff_times.show()

base_table_case_status_history_features = (
    history_with_cutoff_times.withColumn(
Example #2
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.etl.constants_and_parameters import TIME_INTERVAL
from source.utils.ml_tools.categorical_encoders import (
    one_hot_encode_categorical, label_encode_categorical_inplace,
    encode_categorical_using_mean_response_rate_inplace)
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))
case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

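# Reference IDs of escalated cases, computed the same way as in Example #1.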
ref_ids_escalated = (case_status_history.filter(
    F.col("inverse_time_to_next_escalation") > 0).select(
        'reference_id').distinct())
ref_ids_escalated.count()

comments_with_cutoff_times = spark_read.parquet(
    path=data_dir.make_processed_path('comments_with_cutoff_times'))

comments_with_cutoff_times.show()
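# How often each comment type occurs, most frequent first.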
comments_with_cutoff_times.groupby('comment_type').count().orderBy(
    F.desc('count')).show(n=100)
Example #3
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

response_file = spark_read.parquet(
    data_dir.make_interim_path('response_file')
)
milestones = spark_read.parquet(
    data_dir.make_interim_path('milestones')
)

milestones.orderBy('reference_id', 'seconds_since_case_start').show()

# EDA milestones
# milestone_id distributions
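# create_distribution is assumed to be a project-specific DataFrame helper, not a built-in PySpark method.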
milestones.create_distribution(
    groupby_columns=['milestone_id'],
    numeric_column='seconds_since_case_start',
    include_count_distribution=True
).orderBy(F.desc('count')).show(n=70)
Example #4
from pyspark.sql import Window
from pyspark.sql import functions as F

from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (SparkRead, SparkWrite)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

case_status_history = spark_read.parquet(
    path=data_dir.make_interim_path('case_status_history'))

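# Make sure inverse_time_to_next_escalation is typed as a double.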
test = spark_read.parquet(path=data_dir.make_interim_path('test')).withColumn(
    'inverse_time_to_next_escalation',
    F.col('inverse_time_to_next_escalation').cast('double'))

milestones = spark_read.parquet(path=data_dir.make_interim_path('milestones'))

comments = spark_read.parquet(path=data_dir.make_interim_path('comments'))

case_status_history.show()

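# Per escalated case: time of the first escalation and time of the last escalated status (aliased case_end).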
escalation_starts = (case_status_history.filter(
    F.col('is_escalate') == 'Y').groupby('reference_id').agg(
        F.min('seconds_since_case_start').alias('escalation_start'),
        F.max('seconds_since_case_start').alias('case_end')))

escalation_starts.count()  # 646
escalation_starts.filter(F.col('reference_id') == 100087).show()
Example #5
# from config.spark_setup import launch_spark

from pyspark.sql import functions as F
from config.data_paths import data_dir
from config.env import *
from source.utils.reader_writers.reader_writers import (
    SparkRead,
    SparkWrite
)

spark_read = SparkRead(spark=spark)
spark_write = SparkWrite()

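# Load the response file (presumably the modelling target) from the interim data directory.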
response_file = spark_read.parquet(
    data_dir.make_interim_path('response_file')
)

response_file.orderBy('reference_id', 'seconds_since_case_start').show()

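# Load the pivoted milestone_id features from the feature directory.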
pivoted_milestone_ids_features = spark_read.parquet(
    path=data_dir.make_feature_path('pivoted_milestone_ids'),
)

pivoted_milestone_ids_features.show()

print(f"rows in response file {response_file.count()}")  # 720938
print(f"rows in pivoted milestone features {pivoted_milestone_ids_features.count()}")

model_file = (
    response_file