示例#1
0
def test__infer_task():
    """Check that _infer_task classifies each (feature, target) pair correctly."""
    # checks run in the same order as in the original implementation
    from ppscore.calculation import _infer_task

    df = pd.read_csv("examples/titanic.csv")
    df["constant"] = 1
    df = df.reset_index()
    df["id"] = df["index"].astype(str)
    df["Pclass_category"] = df["Pclass"].astype("category")
    df["Pclass_datetime"] = pd.to_datetime(
        df["Pclass"], infer_datetime_format=True
    )

    expected_tasks = [
        ("Age", "Age", "predict_itself"),
        ("Age", "constant", "predict_constant"),
        ("Age", "Survived", "classification"),
        ("Age", "id", "predict_id"),
        # classification because numeric but few categories
        ("Age", "SibSp", "classification"),
        ("Age", "Pclass_category", "classification"),
    ]
    for feature, target, task in expected_tasks:
        assert _infer_task(df, feature, target) == task

    # a datetime target is rejected by pps.score
    with pytest.raises(Exception):
        pps.score(df, "Age", "Pclass_datetime")

    assert _infer_task(df, "Survived", "Age") == "regression"
示例#2
0
def get_pps_array(df):
    """Return the full PPS matrix of df as a 2-D numpy array.

    Entry [i][j] is the predictive power score of column i predicting column j.
    """
    columns = list(df.columns)
    rows = [
        [pps.score(df, feature, target)['ppscore'] for target in columns]
        for feature in columns
    ]
    return np.array(rows)
def run_predictive_power_score(df, target, save_path):
    """
    Calculates the predictive power score (pps) for each feature. If the score is 0, then it is not any better than a
    baseline model. If it's 1, then the feature is a perfect predictor. The model_score is the weighted F1 score for
    a univariate model predicting the target.

    :param df: pandas dataframe
    :param target: name of our target
    :param save_path: path in which to save the output
    """
    df = pd.get_dummies(df, dummy_na=True)
    df[target] = df[target].astype(str)
    rows = []
    for feature in tqdm(list(df)):
        if feature != target:
            temp_score_dict = pps.score(df, feature, target)
            rows.append({
                'feature': feature,
                'pps': temp_score_dict.get('ppscore'),
                'model_score': temp_score_dict.get('model_score'),
            })
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # collecting plain dicts and building the frame once is also O(n) instead
    # of O(n^2) copies.
    pps_df = pd.DataFrame(rows, columns=['feature', 'pps', 'model_score'])
    pps_df.to_csv(os.path.join(save_path, 'predictive_power_score.csv'),
                  index=False)
示例#4
0
def feature_score(df):
    """Streamlit sidebar widget: show the PPS of one selected feature for one selected target."""
    if st.sidebar.checkbox('View PPScore between features '):
        feature = st.sidebar.selectbox('Select feature', df.columns)
        target = st.sidebar.selectbox('Select target', df.columns)
        st.markdown('**Prediction of {} using {}**'.format(target, feature))
        result = pps.score(df, feature, target)
        st.write(result['ppscore'])
示例#5
0
def drop_corr(df, target_column, correlation_percent=0.8, return_cols=False):
    '''
    Use the predictive power score to choose which of each pair of correlated
    columns to drop: for every correlated pair, the column with the lower PPS
    towards the target is dropped.

    :param df: pandas dataframe of features
    :param target_column: target values, attached as a temporary 'target' column
    :param correlation_percent: correlation threshold above which a pair counts
        as correlated (previously ignored -- a hard-coded 0.8 was always used)
    :param return_cols: if True, also return the list of dropped column names
    '''
    temp_df = df.copy()
    temp_df['target'] = target_column
    # BUG FIX: pass the caller's threshold instead of the hard-coded 0.8
    pos, _ = get_corr(df, correlation_percent)
    correlated_columns = pos['columns']
    cols_to_drop = []
    for col_1, col_2 in correlated_columns:
        score_1 = pps.score(temp_df, col_1, 'target')['ppscore']
        score_2 = pps.score(temp_df, col_2, 'target')['ppscore']
        # keep the column with the higher predictive power
        if score_1 > score_2:
            cols_to_drop.append(col_2)
        else:
            cols_to_drop.append(col_1)

    if return_cols:
        return df.drop(cols_to_drop, axis=1), cols_to_drop
    return df.drop(cols_to_drop, axis=1)
示例#6
0
# - Changing some data types
# - Renaming the column names to be more clear

# %%
df = df[["Survived", "Pclass", "Sex", "Age", "Ticket", "Fare", "Embarked"]]
# one rename call covers all four chained per-column renames
df = df.rename(columns={
    "Pclass": "Class",
    "Ticket": "TicketID",
    "Fare": "TicketPrice",
    "Embarked": "Port",
})

# %% [markdown]
# ## Single Predictive Power Score
# - Answering the question: how well can Sex predict the Survival probability?

# %%
pps.score(df, "Sex", "Survived")

# %% [markdown]
# ## PPS matrix
# - Answering the question: which predictive patterns exist between the columns?

# %%
matrix = pps.matrix(df)

# %%
matrix

# %%
heatmap(matrix)

# %% [markdown]
示例#7
0
def test_score():
    """End-to-end test of pps.score: input validation, case discrimination,
    and score sanity checks for every supported feature/target dtype.

    FIX: the function previously ended with ``return sns.heatmap(df, ...)`` --
    a paste artifact that plotted the raw dataframe and made the test return a
    non-None value (deprecated by pytest). The stray return is removed; all
    assertions are unchanged.
    """
    df = pd.DataFrame()
    df["x"] = np.random.uniform(-2, 2, 1_000)
    df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
    df["y"] = df["x"] * df["x"] + df["error"]

    df["constant"] = 1
    df = df.reset_index()
    df["id"] = df["index"].astype(str)

    df["x_greater_0_boolean"] = df["x"] > 0
    # df["x_greater_0_string"] = df["x_greater_0_boolean"].astype(str)
    df["x_greater_0_string"] = pd.Series(
        df["x_greater_0_boolean"].apply(str), dtype="string"
    )
    df["x_greater_0_string_object"] = df["x_greater_0_string"].astype("object")
    df["x_greater_0_string_category"] = df["x_greater_0_string"].astype("category")

    df["x_greater_0_boolean_object"] = df["x_greater_0_boolean"].astype("object")
    df["x_greater_0_boolean_category"] = df["x_greater_0_boolean"].astype("category")

    df["nan"] = np.nan

    duplicate_column_names_df = pd.DataFrame()
    duplicate_column_names_df["x1"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df["x2"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df["unique_column_name"] = np.random.uniform(-2, 2, 10)
    duplicate_column_names_df.columns = [
        "duplicate_column_name",
        "duplicate_column_name",
        "unique_column_name",
    ]

    # check input types
    with pytest.raises(TypeError):
        numpy_array = np.random.randn(10, 10)  # not a DataFrame
        pps.score(numpy_array, "x", "y")

    with pytest.raises(ValueError):
        pps.score(df, "x_column_that_does_not_exist", "y")

    with pytest.raises(ValueError):
        pps.score(df, "x", "y_column_that_does_not_exist")

    with pytest.raises(AttributeError):
        # the task argument is not supported any more
        pps.score(df, "x", "y", task="classification")

    with pytest.raises(AssertionError):
        # df shall not have duplicate column names
        pps.score(
            duplicate_column_names_df, "duplicate_column_name", "unique_column_name"
        )

    with pytest.raises(AssertionError):
        # df shall not have duplicate column names
        pps.score(
            duplicate_column_names_df, "unique_column_name", "duplicate_column_name"
        )

    # check cross_validation
    # if more folds than data, there is an error
    with pytest.raises(ValueError):
        assert pps.score(df, "x", "y", cross_validation=2000, catch_errors=False)

    # check random_seed
    assert pps.score(df, "x", "y", random_seed=1) == pps.score(
        df, "x", "y", random_seed=1
    )
    assert pps.score(df, "x", "y", random_seed=1) != pps.score(
        df, "x", "y", random_seed=2
    )
    # the random seed that is drawn automatically is smaller than <1000
    assert pps.score(df, "x", "y") != pps.score(df, "x", "y", random_seed=123_456)

    # check invalid_score
    invalid_score = -99
    assert (
        pps.score(df, "nan", "y", invalid_score=invalid_score)["ppscore"]
        == invalid_score
    )

    # check catch_errors using the cross_validation error from above
    assert pps.score(df, "x", "y", cross_validation=2000, invalid_score=invalid_score, catch_errors=True)["ppscore"] == invalid_score

    # check case discrimination
    assert pps.score(df, "x", "y")["case"] == "regression"
    assert pps.score(df, "x", "x_greater_0_string")["case"] == "classification"
    assert pps.score(df, "x", "constant")["case"] == "target_is_constant"
    assert pps.score(df, "x", "x")["case"] == "predict_itself"
    assert pps.score(df, "x", "id")["case"] == "target_is_id"
    assert pps.score(df, "nan", "y")["case"] == "empty_dataframe_after_dropping_na"

    # check scores
    # feature is id
    assert pps.score(df, "id", "y")["ppscore"] == 0

    # numeric feature and target
    assert pps.score(df, "x", "y")["ppscore"] > 0.5
    assert pps.score(df, "y", "x")["ppscore"] < 0.05

    # boolean feature or target
    assert pps.score(df, "x", "x_greater_0_boolean")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean", "x")["ppscore"] < 0.6

    # string feature or target
    assert pps.score(df, "x", "x_greater_0_string")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string", "x")["ppscore"] < 0.6

    # object feature or target
    assert pps.score(df, "x", "x_greater_0_string_object")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string_object", "x")["ppscore"] < 0.6

    # category feature or target
    assert pps.score(df, "x", "x_greater_0_string_category")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_string_category", "x")["ppscore"] < 0.6

    # object feature or target
    assert pps.score(df, "x", "x_greater_0_boolean_object")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean_object", "x")["ppscore"] < 0.6

    # category feature or target
    assert pps.score(df, "x", "x_greater_0_boolean_category")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0_boolean_category", "x")["ppscore"] < 0.6


# %%
# Synthetic data: y is a quadratic function of x plus uniform noise, so the
# (linear) correlation of x and y is ~0 while the PPS of x -> y is high.
df = pd.DataFrame()
df["x"] = np.random.uniform(-2, 2, 1_000_000)
df["error"] = np.random.uniform(-0.5, 0.5, 1_000_000)
df["y"] = df["x"] * df["x"] + df["error"]

# %%
# sample before plotting -- 1M points would overwhelm the scatterplot
sns.scatterplot(x="x", y="y", data=df.sample(10_000))

# %%
matrix = pps.matrix(df)

# %%
matrix

# %%
heatmap(matrix)

# %%
pps.score(df, "x", "y")

# %%

# %%
示例#9
0
import ppscore as pps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# deliberately silence all warnings emitted while scoring the predictors
warnings.simplefilter(action='ignore')

data = pd.read_csv('dataset_challenge_one.tsv', delimiter='\t')
data = data.fillna(0)
predictors = data.columns[:-1]
# PPS of every predictor column with respect to the 'class' target
score = [pps.score(data, predictor, 'class')['ppscore'] for predictor in predictors]
var = list(predictors)

ppscore_df = pd.DataFrame({'predictor': var, 'ppscore': score})
ppscore_df = ppscore_df.sort_values(['ppscore'], ascending=False).reset_index()

plt.figure(figsize=(6, 3))
plt.plot(ppscore_df['predictor'].index, ppscore_df['ppscore'])
plt.grid()
plt.xticks(np.arange(0, 1600, 50), rotation='vertical', fontsize=10)
plt.ylabel('ppscore', fontsize=16)
plt.xlabel('Predictor', fontsize=10)
plt.yticks(np.arange(0, 0.3, 0.05), fontsize=10)
plt.title('Predictive Power Score Plot', fontsize=20)
plt.show()

ppscore_df['ppscore'].value_counts()
bins = [
    -1,
示例#10
0
    plinear_data = np.hstack((dat, dat)) + 1
    df = pd.DataFrame(data=plinear_data, columns=["x", "y"])

    enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
    XP_data_nmi = np.squeeze(
        enc.fit_transform(np.atleast_2d(plinear_data[:, 0]).T))
    enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
    YP_data_nmi = np.squeeze(
        enc.fit_transform(np.atleast_2d(plinear_data[:, 1]).T))

    plinear_pc = np.round(
        pearsonr(plinear_data[:, 0], plinear_data[:, 1])[0], 2)
    plinear_nmi = np.round(
        normalized_mutual_info_score(XP_data_nmi, YP_data_nmi), 2)
    plinear_hsic = np.round(ℍ(plinear_data[:, 0], plinear_data[:, 1]), 2)
    plinear_pps = np.round(pps.score(df, "x", "y")['ppscore'], 2)

    print('Linear Relationship:')
    print('\tCorrelation : ', plinear_pc)
    print('\tNMI : ', plinear_nmi)
    print('\tpps : ', plinear_pps)
    print('\tHSIC : ', plinear_hsic)

    #	Linear Data
    dat = np.random.rand(n, 1)
    linear_data = np.hstack((dat, dat)) + 0.04 * np.random.randn(n, 2)
    df = pd.DataFrame(data=linear_data, columns=["x", "y"])

    enc = KBinsDiscretizer(n_bins=10, encode='ordinal')
    XL_data_nmi = np.squeeze(
        enc.fit_transform(np.atleast_2d(linear_data[:, 0]).T))
示例#11
0
 def predictive_power_score(self, df, feature_col_name, target_col_name):
     """Return the full pps.score dict after dropping rows where either column is NaN."""
     cleaned = df.dropna(subset=[feature_col_name, target_col_name])
     return pps.score(cleaned, feature_col_name, target_col_name)
示例#12
0
def test_score():
    """Smoke test of pps.score: task inference, special cases, and score ranges.

    NOTE(review): this targets an older ppscore API in which ``score`` accepted
    a positional ``task`` argument and returned a "task" key -- confirm the
    pinned ppscore version before reusing.
    """
    df = pd.DataFrame()
    df["x"] = np.random.uniform(-2, 2, 1_000)
    df["error"] = np.random.uniform(-0.5, 0.5, 1_000)
    df["y"] = df["x"] * df["x"] + df["error"]  # quadratic signal plus noise

    df["constant"] = 1
    df = df.reset_index()
    df["id"] = df["index"].astype(str)  # unique string column -> id-like target

    df["x_greater_0"] = df["x"] > 0
    df["x_greater_0"] = df["x_greater_0"].astype(str)

    # an all-NaN column cannot be scored
    df["nan"] = np.nan
    with pytest.raises(Exception):
        pps.score(df, "nan", "y")

    assert pps.score(df, "x", "y", "regression")["task"] == "regression"

    assert pps.score(df, "x", "constant")["task"] == "predict_constant"
    assert pps.score(df, "x", "x")["task"] == "predict_itself"
    assert pps.score(df, "x", "id")["task"] == "predict_id"

    # feature is id
    assert pps.score(df, "id", "y")["ppscore"] == 0

    # numeric feature and target
    assert pps.score(df, "x", "y")["ppscore"] > 0.5
    assert pps.score(df, "y", "x")["ppscore"] < 0.05

    # object feature or target
    assert pps.score(df, "x", "x_greater_0")["ppscore"] > 0.6
    assert pps.score(df, "x_greater_0", "x")["ppscore"] < 0.6