def timeline_operator(pkl_path: str, input_table_spec: TableSpec):
    """Loads a pandas DataFrame from a pickle, converts it to a PySpark
    DataFrame, and registers it as a temp view. Then calls the Timeline
    operator to produce the output table.
    """
    pd_df = pd.read_pickle(pkl_path)
    spark = get_spark_session()
    df = spark.createDataFrame(pd_df)
    input_name = f"{input_table_spec.table_name}{PRE_TIMELINE_SUFFIX}"
    df.createTempView(input_name)

    output_name = input_table_spec.table_name
    include_possible_actions = "possible_actions" in pd_df

    arg = {
        "startDs": "2019-01-01",
        "endDs": "2019-01-01",
        "addTerminalStateRow": True,
        "inputTableName": input_name,
        "outputTableName": output_name,
        "includePossibleActions": include_possible_actions,
        "percentileFunction": "percentile_approx",
        "rewardColumns": ["reward", "metrics"],
        "extraFeatureColumns": [],
    }
    call_spark_class(spark, class_name="Timeline", args=json.dumps(arg))
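
# Illustrative usage sketch (not part of the original module): how timeline_operator
# might be driven. The pickle path and table name are assumptions made for the
# example; TableSpec is assumed to accept a `table_name` keyword argument.
def _example_timeline_operator():
    input_table_spec = TableSpec(table_name="cartpole_replay_buffer")  # hypothetical name
    # The pickled DataFrame is expected to carry the pre-timeline columns
    # (mdp_id, sequence_number, state_features, action, reward, ...).
    timeline_operator(
        pkl_path="/tmp/cartpole_replay_buffer.pkl",  # hypothetical path
        input_table_spec=input_table_spec,
    )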
def identify_normalization_parameters(
    table_spec: TableSpec,
    column_name: str,
    preprocessing_options: PreprocessingOptions,
    seed: Optional[int] = None,
) -> Dict[int, NormalizationParameters]:
    """Get normalization parameters"""
    sqlCtx = get_spark_session()
    df = sqlCtx.sql(f"SELECT * FROM {table_spec.table_name}")
    df = create_normalization_spec_spark(
        df, column_name, preprocessing_options.num_samples, seed
    )
    rows = df.collect()

    normalization_processor = normalization_helper(
        max_unique_enum_values=preprocessing_options.max_unique_enum_values,
        quantile_size=preprocessing_options.quantile_size,
        quantile_k2_threshold=preprocessing_options.quantile_k2_threshold,
        skip_box_cox=preprocessing_options.skip_box_cox,
        skip_quantiles=preprocessing_options.skip_quantiles,
        feature_overrides=preprocessing_options.feature_overrides,
        allowedlist_features=preprocessing_options.allowedlist_features,
        assert_allowedlist_feature_coverage=preprocessing_options.assert_allowedlist_feature_coverage,
    )
    return normalization_processor(rows)
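
# Illustrative sketch (not part of the original module): computing normalization
# parameters for a feature column of the table produced by timeline_operator.
# The table name, the column name, and the assumption that PreprocessingOptions
# can be constructed with defaults are all hypothetical here.
def _example_identify_normalization_parameters():
    table_spec = TableSpec(table_name="cartpole_replay_buffer")  # hypothetical name
    preprocessing_options = PreprocessingOptions()  # assumes sensible defaults exist
    return identify_normalization_parameters(
        table_spec,
        column_name="state_features",  # hypothetical column name
        preprocessing_options=preprocessing_options,
        seed=42,
    )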
def calc_custom_reward(df, custom_reward_expression: str):
    """Overwrite the reward column with the result of a Spark SQL expression,
    coalescing NULLs to 0 and casting to FLOAT.
    """
    sqlCtx = get_spark_session()
    # create a temporary table for running sql
    temp_table_name = "_tmp_calc_reward_df"
    temp_reward_name = "_tmp_reward_col"
    df.createOrReplaceTempView(temp_table_name)
    df = sqlCtx.sql(
        f"SELECT *, CAST(COALESCE({custom_reward_expression}, 0) AS FLOAT)"
        f" as {temp_reward_name} FROM {temp_table_name}"
    )
    return df.drop("reward").withColumnRenamed(temp_reward_name, "reward")
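
# Illustrative sketch (not part of the original module): calc_custom_reward accepts
# any Spark SQL expression over the DataFrame's columns. The expression below is a
# made-up example that rescales and clips the logged reward column.
def _example_calc_custom_reward(df):
    return calc_custom_reward(df, custom_reward_expression="LEAST(reward * 0.5, 1.0)")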
def query_data(
    self,
    input_table_spec: TableSpec,
    discrete_action: bool,
    actions: Optional[List[str]] = None,
    include_possible_actions=True,
    custom_reward_expression: Optional[str] = None,
    sample_range: Optional[Tuple[float, float]] = None,
    multi_steps: Optional[int] = None,
    gamma: Optional[float] = None,
) -> Dataset:
    """Perform reward calculation, mdp_id hashing and subsampling, and other
    preprocessing such as sparse-to-dense conversion.
    """
    sqlCtx = get_spark_session()  # pyre-ignore
    df = sqlCtx.sql(f"SELECT * FROM {input_table_spec.table_name}")
    df = set_reward_col_as_reward(
        df,
        custom_reward_expression=custom_reward_expression,
        multi_steps=multi_steps,
        gamma=gamma,
    )
    df = hash_mdp_id_and_subsample(df, sample_range=sample_range)
    df = misc_column_preprocessing(df, multi_steps=multi_steps)
    df = state_and_metrics_sparse2dense(
        df,
        states=infer_states_names(df, multi_steps),
        metrics=infer_metrics_names(df, multi_steps),
        multi_steps=multi_steps,
    )
    if discrete_action:
        assert include_possible_actions
        assert actions is not None, "in discrete case, actions must be given."
        df = discrete_action_preprocessing(df, actions=actions, multi_steps=multi_steps)
    else:
        actions = infer_action_names(df, multi_steps)
        df = parametric_action_preprocessing(
            df,
            actions=actions,
            multi_steps=multi_steps,
            include_possible_actions=include_possible_actions,
        )
    df = select_relevant_columns(
        df,
        discrete_action=discrete_action,
        include_possible_actions=include_possible_actions,
    )
    return upload_as_parquet(df)
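
# Illustrative sketch (not part of the original module): query_data is written as a
# method (note the `self` parameter), so the call below assumes an instance of the
# enclosing data-fetcher class, named `data_fetcher` purely for illustration. The
# table name, action names, and sample_range values are likewise made up.
def _example_query_data(data_fetcher):
    input_table_spec = TableSpec(table_name="cartpole_replay_buffer")  # hypothetical
    return data_fetcher.query_data(
        input_table_spec=input_table_spec,
        discrete_action=True,
        actions=["left", "right"],  # hypothetical discrete action names
        sample_range=(0.0, 80.0),  # hypothetical split; semantics set by hash_mdp_id_and_subsample
    )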
def upload_as_parquet(df) -> Dataset:
    """Save the DataFrame as a Parquet table under a randomly generated name.
    Fails if a non-existent table name cannot be found.
    """
    # get a random tmp name and check if it exists
    sqlCtx = get_spark_session()
    success = False
    for _ in range(MAX_UPLOAD_PARQUET_TRIES):
        suffix = rand_string(length=UPLOAD_PARQUET_TMP_SUFFIX_LEN)
        rand_name = f"tmp_parquet_{suffix}"
        if not sqlCtx.catalog._jcatalog.tableExists(rand_name):
            success = True
            break

    if not success:
        raise Exception(
            f"Failed to find name after {MAX_UPLOAD_PARQUET_TRIES} tries."
        )

    # perform the write
    df.write.mode("errorifexists").format("parquet").saveAsTable(rand_name)
    parquet_url = get_table_url(rand_name)
    logger.info(f"Saved parquet to {parquet_url}")
    return Dataset(parquet_url=parquet_url)
def get_table_row_count(parquet_url: str):
    spark = get_spark_session()
    return spark.read.parquet(parquet_url).count()
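
# Illustrative sketch (not part of the original module): upload_as_parquet returns a
# Dataset holding the Parquet URL, which can then be passed to get_table_row_count.
# The DataFrame `df` here stands in for the output of the preprocessing in query_data.
def _example_upload_and_count(df):
    dataset = upload_as_parquet(df)
    n_rows = get_table_row_count(dataset.parquet_url)
    logger.info(f"Uploaded dataset has {n_rows} rows")
    return dataset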