Example #1
def timeline_operator(pkl_path: str, input_table_spec: TableSpec):
    """Loads a pandas parquet, converts to pyspark, and uploads df to Hive.
    Then call the timeline operator.
    """

    pd_df = pd.read_pickle(pkl_path)
    spark = get_spark_session()
    df = spark.createDataFrame(pd_df)
    input_name = f"{input_table_spec.table_name}{PRE_TIMELINE_SUFFIX}"
    df.createTempView(input_name)

    output_name = input_table_spec.table_name
    include_possible_actions = "possible_actions" in pd_df
    arg = {
        "startDs": "2019-01-01",
        "endDs": "2019-01-01",
        "addTerminalStateRow": True,
        "inputTableName": input_name,
        "outputTableName": output_name,
        "includePossibleActions": include_possible_actions,
        "percentileFunction": "percentile_approx",
        "rewardColumns": ["reward", "metrics"],
        "extraFeatureColumns": [],
    }
    call_spark_class(spark, class_name="Timeline", args=json.dumps(arg))
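A minimal usage sketch (not from the original source): it assumes TableSpec can be constructed with just a table_name, since that is the only field the example reads, and the pickle path below is purely illustrative.

# Hypothetical call; the table name and pickle path are placeholders.
spec = TableSpec(table_name="cartpole_discrete_training")
timeline_operator("/tmp/cartpole_pre_timeline.pkl", input_table_spec=spec)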
Example #2
def identify_normalization_parameters(
    table_spec: TableSpec,
    column_name: str,
    preprocessing_options: PreprocessingOptions,
    seed: Optional[int] = None,
) -> Dict[int, NormalizationParameters]:
    """Get normalization parameters"""
    sqlCtx = get_spark_session()
    df = sqlCtx.sql(f"SELECT * FROM {table_spec.table_name}")
    df = create_normalization_spec_spark(df, column_name,
                                         preprocessing_options.num_samples,
                                         seed)
    rows = df.collect()

    normalization_processor = normalization_helper(
        max_unique_enum_values=preprocessing_options.max_unique_enum_values,
        quantile_size=preprocessing_options.quantile_size,
        quantile_k2_threshold=preprocessing_options.quantile_k2_threshold,
        skip_box_cox=preprocessing_options.skip_box_cox,
        skip_quantiles=preprocessing_options.skip_quantiles,
        feature_overrides=preprocessing_options.feature_overrides,
        allowedlist_features=preprocessing_options.allowedlist_features,
        assert_allowedlist_feature_coverage=preprocessing_options.assert_allowedlist_feature_coverage,
    )
    return normalization_processor(rows)
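A hedged usage sketch: it assumes PreprocessingOptions can be constructed with default values and that the table exposes a column named "state_features"; both names are illustrative, not part of the original example.

# Illustrative call; default options and the names used are assumptions.
opts = PreprocessingOptions()
norm_params = identify_normalization_parameters(
    TableSpec(table_name="cartpole_discrete_training"),
    column_name="state_features",
    preprocessing_options=opts,
    seed=42,
)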
Example #3
def calc_custom_reward(df, custom_reward_expression: str):
    sqlCtx = get_spark_session()
    # create a temporary table for running sql
    temp_table_name = "_tmp_calc_reward_df"
    temp_reward_name = "_tmp_reward_col"
    df.createOrReplaceTempView(temp_table_name)
    df = sqlCtx.sql(
        f"SELECT *, CAST(COALESCE({custom_reward_expression}, 0) AS FLOAT)"
        f" as {temp_reward_name} FROM {temp_table_name}")
    return df.drop("reward").withColumnRenamed(temp_reward_name, "reward")
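Because the expression is spliced directly into the SELECT, any Spark SQL expression over the DataFrame's columns works, and it may reference the original reward column (which is dropped and replaced afterwards). A small illustrative sketch with made-up columns:

# Illustrative only: doubles the logged reward and adds a bonus column.
spark = get_spark_session()
df = spark.createDataFrame([(1, 0.5, 1.0)], ["mdp_id", "reward", "bonus"])
df = calc_custom_reward(df, "reward * 2 + bonus")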
Example #4
    def query_data(
        self,
        input_table_spec: TableSpec,
        discrete_action: bool,
        actions: Optional[List[str]] = None,
        include_possible_actions=True,
        custom_reward_expression: Optional[str] = None,
        sample_range: Optional[Tuple[float, float]] = None,
        multi_steps: Optional[int] = None,
        gamma: Optional[float] = None,
    ) -> Dataset:
        """Perform reward calculation, hashing mdp + subsampling and
        other preprocessing such as sparse2dense.
        """
        sqlCtx = get_spark_session()
        # pyre-ignore
        df = sqlCtx.sql(f"SELECT * FROM {input_table_spec.table_name}")
        df = set_reward_col_as_reward(
            df,
            custom_reward_expression=custom_reward_expression,
            multi_steps=multi_steps,
            gamma=gamma,
        )
        df = hash_mdp_id_and_subsample(df, sample_range=sample_range)
        df = misc_column_preprocessing(df, multi_steps=multi_steps)
        df = state_and_metrics_sparse2dense(
            df,
            states=infer_states_names(df, multi_steps),
            metrics=infer_metrics_names(df, multi_steps),
            multi_steps=multi_steps,
        )
        if discrete_action:
            assert include_possible_actions
            assert actions is not None, "in discrete case, actions must be given."
            df = discrete_action_preprocessing(df,
                                               actions=actions,
                                               multi_steps=multi_steps)
        else:
            actions = infer_action_names(df, multi_steps)
            df = parametric_action_preprocessing(
                df,
                actions=actions,
                multi_steps=multi_steps,
                include_possible_actions=include_possible_actions,
            )

        df = select_relevant_columns(
            df,
            discrete_action=discrete_action,
            include_possible_actions=include_possible_actions,
        )
        return upload_as_parquet(df)
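A sketch of a discrete-action call. The owning class is not shown in this excerpt, so data_fetcher below is a hypothetical instance, and the table and action names are assumptions:

# Hypothetical caller; the instance, table name and actions are placeholders.
dataset = data_fetcher.query_data(
    input_table_spec=TableSpec(table_name="cartpole_discrete_training"),
    discrete_action=True,
    actions=["left", "right"],
)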
Example #5
def upload_as_parquet(df) -> Dataset:
    """ Generate a random parquet. Fails if cannot generate a non-existent name. """

    # get a random tmp name and check if it exists
    sqlCtx = get_spark_session()
    success = False
    for _ in range(MAX_UPLOAD_PARQUET_TRIES):
        suffix = rand_string(length=UPLOAD_PARQUET_TMP_SUFFIX_LEN)
        rand_name = f"tmp_parquet_{suffix}"
        if not sqlCtx.catalog._jcatalog.tableExists(rand_name):
            success = True
            break
    if not success:
        raise Exception(
            f"Failed to find name after {MAX_UPLOAD_PARQUET_TRIES} tries.")

    # perform the write
    df.write.mode("errorifexists").format("parquet").saveAsTable(rand_name)
    parquet_url = get_table_url(rand_name)
    logger.info(f"Saved parquet to {parquet_url}")
    return Dataset(parquet_url=parquet_url)
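For reference, the returned Dataset only wraps the Parquet URL, so callers typically keep that handle rather than the DataFrame itself; a minimal sketch, assuming df is a Spark DataFrame already in scope:

# Illustrative: persist df and keep the resulting URL handle.
dataset = upload_as_parquet(df)
print(dataset.parquet_url)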
Example #6
def get_table_row_count(parquet_url: str):
    spark = get_spark_session()
    return spark.read.parquet(parquet_url).count()
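Continuing the previous sketch, a quick sanity check on an uploaded dataset; the non-empty expectation is illustrative, not part of the original example:

# Illustrative check that the upload produced at least one row.
n_rows = get_table_row_count(dataset.parquet_url)
assert n_rows > 0, "uploaded table should not be empty"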