Example #1
def identify_normalization_parameters(
    table_spec: TableSpec,
    column_name: str,
    preprocessing_options: PreprocessingOptions,
    seed: Optional[int] = None,
) -> Dict[int, NormalizationParameters]:
    """ Get normalization parameters """
    sqlCtx = get_spark_session()
    df = sqlCtx.sql(f"SELECT * FROM {table_spec.table_name}")
    df = create_normalization_spec_spark(df, column_name,
                                         preprocessing_options.num_samples,
                                         seed)
    rows = df.collect()

    normalization_processor = normalization_helper(
        max_unique_enum_values=preprocessing_options.max_unique_enum_values,
        quantile_size=preprocessing_options.quantile_size,
        quantile_k2_threshold=preprocessing_options.quantile_k2_threshold,
        skip_box_cox=preprocessing_options.skip_box_cox,
        skip_quantiles=preprocessing_options.skip_quantiles,
        feature_overrides=preprocessing_options.feature_overrides,
        whitelist_features=preprocessing_options.whitelist_features,
        assert_whitelist_feature_coverage=(
            preprocessing_options.assert_whitelist_feature_coverage
        ),
    )
    return normalization_processor(rows)
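
A minimal usage sketch for the function above. The table name, column name, and option values are hypothetical, and it assumes TableSpec and PreprocessingOptions can be constructed with just these keyword arguments (defaults covering the remaining fields):

# Hypothetical invocation; all literals below are placeholders.
table = TableSpec(table_name="my_training_table")
opts = PreprocessingOptions(num_samples=100_000)
norm_params = identify_normalization_parameters(
    table_spec=table,
    column_name="state_features",
    preprocessing_options=opts,
    seed=42,
)
# norm_params: Dict[int, NormalizationParameters], keyed by feature id.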
Example #2
def timeline_operator(pkl_path: str, input_table_spec: TableSpec):
    """ Loads a pandas parquet, converts to pyspark, and uploads df to Hive.
        Then call the timeline operator.
    """

    pd_df = pd.read_pickle(pkl_path)
    spark = get_spark_session()
    df = spark.createDataFrame(pd_df)
    input_name = f"{input_table_spec.table_name}{PRE_TIMELINE_SUFFIX}"
    df.createTempView(input_name)

    output_name = input_table_spec.table_name
    include_possible_actions = "possible_actions" in pd_df
    arg = {
        "startDs": "2019-01-01",
        "endDs": "2019-01-01",
        "addTerminalStateRow": True,
        "inputTableName": input_name,
        "outputTableName": output_name,
        "includePossibleActions": include_possible_actions,
        "percentileFunction": "percentile_approx",
        "rewardColumns": ["reward", "metrics"],
        "extraFeatureColumns": [],
    }
    call_spark_class(spark, class_name="Timeline", args=json.dumps(arg))
Example #3
def upload_to_hive(pkl_path: str, input_table_spec: TableSpec):
    """ Loads a pandas parquet, converts to pyspark, and uploads df to Hive. """
    pd_df = pd.read_pickle(pkl_path)
    spark = get_spark_session()
    df = spark.createDataFrame(pd_df)
    tbl_name = f"{input_table_spec.table_name}{PRE_TIMELINE_SUFFIX}"
    df.write.mode("overwrite").saveAsTable(tbl_name)
Example #4
def query_data(
    input_table_spec: TableSpec,
    actions: List[str],
    custom_reward_expression: Optional[str] = None,
    sample_range: Optional[Tuple[float, float]] = None,
    multi_steps: Optional[int] = None,
    gamma: Optional[float] = None,
) -> Dataset:
    """ Perform reward calculation, hashing mdp + subsampling and
    other preprocessing such as sparse2dense.
    """
    sqlCtx = get_spark_session()
    df = sqlCtx.sql(f"SELECT * FROM {input_table_spec.table_name}")
    states = infer_states_names(df, multi_steps)
    metrics = infer_metrics_names(df, multi_steps)
    df = set_reward_col_as_reward(
        df,
        custom_reward_expression=custom_reward_expression,
        multi_steps=multi_steps,
        gamma=gamma,
    )
    df = hash_mdp_id_and_subsample(df, sample_range=sample_range)
    df = perform_preprocessing(
        df, states=states, actions=actions, metrics=metrics, multi_steps=multi_steps
    )
    df = select_relevant_columns(df)
    df.write.mode("overwrite").parquet(input_table_spec.output_dataset.parquet_url)
    return input_table_spec.output_dataset
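
A hedged usage sketch for this variant; it assumes TableSpec accepts both table_name and an output_dataset of type Dataset, and every literal below is a placeholder:

# Hypothetical invocation; the result lands at output_dataset.parquet_url.
spec = TableSpec(
    table_name="my_training_table",
    output_dataset=Dataset(parquet_url="hdfs://tmp/my_training_output"),
)
dataset = query_data(spec, actions=["up", "down", "left", "right"])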
Example #5
def calc_custom_reward(df, custom_reward_expression: str):
    sqlCtx = get_spark_session()
    # create a temporary table for running sql
    temp_table_name = "_tmp_calc_reward_df"
    temp_reward_name = "_tmp_reward_col"
    df.createOrReplaceTempView(temp_table_name)
    df = sqlCtx.sql(
        f"SELECT *, CAST(COALESCE({custom_reward_expression}, 0) AS FLOAT)"
        f" as {temp_reward_name} FROM {temp_table_name}")
    return df.drop("reward").withColumnRenamed(temp_reward_name, "reward")
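
For illustration, a hedged sketch of calling the helper above. The expression is plain Spark SQL evaluated per row; the cost column referenced here is an assumption:

# Hypothetical: discount the logged reward by a per-row cost column.
df = calc_custom_reward(df, custom_reward_expression="reward - 0.1 * cost")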
Example #6
def query_data(
    input_table_spec: TableSpec,
    discrete_action: bool,
    actions: Optional[List[str]] = None,
    include_possible_actions=True,
    custom_reward_expression: Optional[str] = None,
    sample_range: Optional[Tuple[float, float]] = None,
    multi_steps: Optional[int] = None,
    gamma: Optional[float] = None,
) -> Dataset:
    """ Perform reward calculation, hashing mdp + subsampling and
    other preprocessing such as sparse2dense.
    """
    sqlCtx = get_spark_session()
    # pyre-fixme[16]: `TableSpec` has no attribute `table_name`.
    df = sqlCtx.sql(f"SELECT * FROM {input_table_spec.table_name}")
    df = set_reward_col_as_reward(
        df,
        custom_reward_expression=custom_reward_expression,
        multi_steps=multi_steps,
        gamma=gamma,
    )
    df = hash_mdp_id_and_subsample(df, sample_range=sample_range)
    df = misc_column_preprocessing(df, multi_steps=multi_steps)
    df = state_and_metrics_sparse2dense(
        df,
        states=infer_states_names(df, multi_steps),
        metrics=infer_metrics_names(df, multi_steps),
        multi_steps=multi_steps,
    )
    if discrete_action:
        assert include_possible_actions
        assert actions is not None, "in discrete case, actions must be given."
        df = discrete_action_preprocessing(df,
                                           actions=actions,
                                           multi_steps=multi_steps)
    else:
        actions = infer_action_names(df, multi_steps)
        df = parametric_action_preprocessing(
            df,
            actions=actions,
            multi_steps=multi_steps,
            include_possible_actions=include_possible_actions,
        )

    df = select_relevant_columns(
        df,
        discrete_action=discrete_action,
        include_possible_actions=include_possible_actions,
    )
    return upload_as_parquet(df)
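
A usage sketch for the discrete-action path of query_data, assuming the input table has already been produced by the Timeline operator; the table name and action labels are placeholders:

# Hypothetical discrete-action invocation.
dataset = query_data(
    input_table_spec=TableSpec(table_name="my_training_table"),
    discrete_action=True,
    actions=["up", "down", "left", "right"],
)
print(dataset.parquet_url)  # parquet location returned by upload_as_parquet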
Example #7
def timeline_operator(input_table_spec: TableSpec):
    """ Call the timeline operator. """
    input_name = f"{input_table_spec.table_name}{PRE_TIMELINE_SUFFIX}"
    output_name = input_table_spec.table_name
    arg = {
        "startDs": "2019-01-01",
        "endDs": "2019-01-01",
        "addTerminalStateRow": True,
        "inputTableName": input_name,
        "outputTableName": output_name,
        "includePossibleActions": True,
        "percentileFunction": "percentile_approx",
        "rewardColumns": ["reward", "metrics"],
        "extraFeatureColumns": [],
    }
    input_json = json.dumps(arg)
    spark = get_spark_session()
    spark._jvm.com.facebook.spark.rl.Timeline.main(input_json)
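
Examples #3 and #7 chain naturally: one writes the pre-timeline table the other consumes. A sketch of that flow, with a placeholder pickle path and table name:

# Hypothetical end-to-end data preparation.
spec = TableSpec(table_name="my_training_table")
upload_to_hive("/tmp/raw_transitions.pkl", spec)  # writes f"{spec.table_name}{PRE_TIMELINE_SUFFIX}"
timeline_operator(spec)                           # Timeline writes spec.table_name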
Example #8
def upload_as_parquet(df) -> Dataset:
    """ Generate a random parquet. Fails if cannot generate a non-existent name. """

    # get a random tmp name and check if it exists
    sqlCtx = get_spark_session()
    success = False
    for _ in range(MAX_UPLOAD_PARQUET_TRIES):
        suffix = rand_string(length=UPLOAD_PARQUET_TMP_SUFFIX_LEN)
        rand_name = f"tmp_parquet_{suffix}"
        if not sqlCtx.catalog._jcatalog.tableExists(rand_name):
            success = True
            break
    if not success:
        raise Exception(f"Failed to find name after {MAX_UPLOAD_PARQUET_TRIES} tries.")

    # perform the write
    df.write.mode("errorifexists").format("parquet").saveAsTable(rand_name)
    parquet_url = get_table_url(rand_name)
    logger.info(f"Saved parquet to {parquet_url}")
    return Dataset(parquet_url=parquet_url)
Example #9
def get_table_row_count(parquet_url: str) -> int:
    spark = get_spark_session()
    return spark.read.parquet(parquet_url).count()
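
The last two helpers compose naturally; a small sketch, assuming df is a Spark DataFrame already in scope:

# Write df under a random table name, then count the rows back from the parquet files.
dataset = upload_as_parquet(df)
n_rows = get_table_row_count(dataset.parquet_url)
logger.info(f"{dataset.parquet_url} has {n_rows} rows")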