    def fit_transform(self, df):
        '''
        Fit the encoder on df and transform it.

        :param df: input DataFrame that still contains the target column
        :return: df_index (index of each feature after one-hot encoding),
                 df_value (feature values: min-max-scaled for numerical columns, 1 for categorical ones),
                 y (one-hot-encoded target), cnt (total number of encoded columns)
        '''
        y = pd.get_dummies(df[self.target])
        df = df.drop(self.target, axis=1)
        m, n = df.shape
        cnt = 0
        df_index = df.copy()
        df_value = df.copy()
        df = pd.get_dummies(df, columns=self.categorical)
        self.columns = df.columns
        # Set every categorical feature value to 1
        for name in self.categorical:
            df_value[name] = np.ones(m)
        for name in self.numerical:
            df_value[name] = minmax_scale(df_value[name])
        for i in range(len(self.columns)):
            cnt += 1
            col_name = df.columns[i]
            # Numerical (continuous) column
            if '_' not in col_name:
                df_index[col_name] = np.array([i for j in range(m)])
            else:
                # Categorical column
                col = col_name.split('_')[0]
                for j in df.index:
                    # Look up the row by its index; this avoids NaN values after train_test_split
                    if df.loc[j, col_name] == 1:
                        df_index.at[j, col] = i
        self.cnt = cnt
        return df_index, df_value, y, cnt
Example #2
def test_get_dummies(request, data, dummy_na, drop_first):
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    try:
        pandas_result = pandas.get_dummies(pandas_df,
                                           dummy_na=dummy_na,
                                           drop_first=drop_first)
    except Exception as e:
        with pytest.raises(type(e)):
            pd.get_dummies(modin_df, dummy_na=dummy_na, drop_first=drop_first)
    else:
        modin_result = pd.get_dummies(modin_df,
                                      dummy_na=dummy_na,
                                      drop_first=drop_first)
        df_equals(modin_result, pandas_result)
def processing_feature(df, target=None, categorical=None, numerical=None):
    '''
    Preprocess the data.

    :param df: input DataFrame
    :param target: name of the target (label) column
    :param categorical: [list] names of the categorical columns
    :param numerical: [list] names of the numerical (continuous) columns
    :return: df_index (index of each feature after one-hot encoding),
             df_value (feature values: min-max-scaled for numerical columns, 1 for categorical ones),
             y (one-hot-encoded target), cnt (total number of encoded columns)
    '''
    # Separate the target variable
    if target:
        # y = df[target]
        # y = y.values.reshape(len(y), 1)  # reshape y
        y = pd.get_dummies(df[target])
        df = df.drop(target, axis=1)
    else:
        y = np.zeros(df.shape[0]).astype('float32')
        y = y.reshape(len(y), 1)
    m, n = df.shape
    cnt = 0
    df_index = df.copy()
    df_value = df.copy()
    df = pd.get_dummies(df, columns=categorical)
    # Set every categorical feature value to 1
    for name in categorical:
        df_value[name] = np.ones(m)
    for name in numerical:
        df_value[name] = minmax_scale(df_value[name])
    for i in range(len(df.columns)):
        cnt += 1
        col_name = df.columns[i]
        # Numerical (continuous) column
        if '_' not in col_name:
            df_index[col_name] = np.array([i for j in range(m)])
        else:
            # Categorical column
            col = col_name.split('_')[0]
            for j in df.index:
                # Look up the row by its index; this avoids NaN values after train_test_split
                if df.loc[j, col_name] == 1:
                    df_index.at[j, col] = i
    return df_index, df_value, y, cnt
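# A minimal usage sketch (not from the original source): the toy DataFrame and its
# column names below are assumptions chosen only to illustrate the outputs, and the
# imports mirror what processing_feature itself relies on.
import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

toy = pd.DataFrame({
    'age': [23, 45, 31],              # numerical column
    'color': ['red', 'blue', 'red'],  # categorical column
    'label': [0, 1, 0],               # target column
})
df_index, df_value, y, cnt = processing_feature(
    toy, target='label', categorical=['color'], numerical=['age'])
# df_index holds, for each original column, the position of its active one-hot column;
# df_value holds min-max-scaled values for 'age' and 1.0 for 'color'; cnt == 3 here.
print(df_index)
print(df_value)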
    def transform(self, df):
        '''
        Transform df using the columns learned by fit_transform.

        :param df: input DataFrame (the target column is optional here)
        :return: df_index, df_value, y, cnt (same layout as fit_transform)
        '''
        if len(self.columns) == 0:
            raise RuntimeError('please call fit_transform first')
        if self.target in df.columns:
            y = pd.get_dummies(df[self.target])
            df = df.drop(self.target, axis=1)
        else:
            y = np.zeros(df.shape[0]).astype('float32')
            y = y.reshape(len(y), 1)
        m, n = df.shape
        df_index = df.copy()
        df_value = df.copy()
        df = pd.get_dummies(df, columns=self.categorical)
        # Set every categorical feature value to 1
        for name in self.categorical:
            df_value[name] = np.ones(m)
        for name in self.numerical:
            df_value[name] = minmax_scale(df_value[name])
        for i in range(len(self.columns)):
            if self.columns[i] not in df.columns:
                continue
            col_name = self.columns[i]
            # Numerical (continuous) column
            if '_' not in col_name:
                df_index[col_name] = np.array([i for j in range(m)])
            else:
                # Categorical column
                col = col_name.split('_')[0]
                for j in df.index:
                    # Look up the row by its index; this avoids NaN values after train_test_split
                    if df.loc[j, col_name] == 1:
                        df_index.at[j, col] = i
        return df_index, df_value, y, self.cnt
Example #5
def multi_weighted_logloss(y_true, y_preds, classes, class_weights):
    """
    refactor from
    @author olivier https://www.kaggle.com/ogrellier
    multi logloss for PLAsTiCC challenge
    """
    y_p = y_preds.reshape(y_true.shape[0], len(classes), order="F")
    y_ohe = pd.get_dummies(y_true)
    y_p = np.clip(a=y_p, a_min=1e-15, a_max=1 - 1e-15)
    y_p_log = np.log(y_p)
    y_log_ones = np.sum(y_ohe.values * y_p_log, axis=0)
    nb_pos = y_ohe.sum(axis=0).values.astype(float)
    class_arr = np.array([class_weights[k] for k in sorted(class_weights.keys())])
    y_w = y_log_ones * class_arr / nb_pos

    loss = -np.sum(y_w) / np.sum(class_arr)
    return loss
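# A hedged usage sketch with made-up numbers (not from the original source). It only
# illustrates the expected input layout: y_preds is the probability matrix flattened
# in column-major (Fortran) order, matching the reshape(..., order="F") above.
import numpy as np
import pandas as pd

classes = [0, 1, 2]
class_weights = {0: 1.0, 1: 2.0, 2: 1.0}
y_true = np.array([0, 1, 2])
probs = np.array([[0.8, 0.1, 0.1],
                  [0.2, 0.7, 0.1],
                  [0.1, 0.2, 0.7]])   # rows: samples, columns: classes
y_preds = probs.flatten(order="F")    # one column-major block per class
print(multi_weighted_logloss(y_true, y_preds, classes, class_weights))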
Example #6
matplotlib.rcParams["figure.figsize"] = (12.0, 6.0)
prices = pd.DataFrame(
    {"price": train["SalePrice"], "log(price + 1)": np.log1p(train["SalePrice"])}
)
prices.hist()
train["SalePrice"] = np.log1p(train["SalePrice"])
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(
    lambda x: skew(x.dropna())
)  # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
X_train = all_data[: train.shape[0]]
X_test = all_data[train.shape[0] :]
y = train.SalePrice
from sklearn.linear_model import Ridge, LassoCV  # RidgeCV, ElasticNet, LassoLarsCV
from sklearn.model_selection import cross_val_score


def rmse_cv(model):
    rmse = np.sqrt(
        -cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5)
    )
    return rmse
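# A small usage sketch; the alpha value below is an assumption, not taken from the
# original notebook.
model_ridge = Ridge(alpha=10)
print("Ridge CV RMSE: %.4f" % rmse_cv(model_ridge).mean())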

Example #7
dataset["Fsize"] = dataset["SibSp"] + dataset["Parch"] + 1
g = sns.factorplot(x="Fsize", y="Survived", data=dataset)
g = g.set_ylabels("Survival Probability")
dataset["Single"] = dataset["Fsize"].map(lambda s: 1 if s == 1 else 0)
dataset["SmallF"] = dataset["Fsize"].map(lambda s: 1 if s == 2 else 0)
dataset["MedF"] = dataset["Fsize"].map(lambda s: 1 if 3 <= s <= 4 else 0)
dataset["LargeF"] = dataset["Fsize"].map(lambda s: 1 if s >= 5 else 0)
g = sns.factorplot(x="Single", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="SmallF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="MedF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
g = sns.factorplot(x="LargeF", y="Survived", data=dataset, kind="bar")
g = g.set_ylabels("Survival Probability")
dataset = pd.get_dummies(dataset, columns=["Title"])
dataset = pd.get_dummies(dataset, columns=["Embarked"], prefix="Em")
dataset.head()
dataset["Cabin"].head()
dataset["Cabin"].describe()
dataset["Cabin"].isnull().sum()
dataset["Cabin"][dataset["Cabin"].notnull()].head()
dataset["Cabin"] = pd.Series(
    [i[0] if not pd.isnull(i) else "X" for i in dataset["Cabin"]])
g = sns.countplot(dataset["Cabin"],
                  order=["A", "B", "C", "D", "E", "F", "G", "T", "X"])
g = sns.factorplot(
    y="Survived",
    x="Cabin",
    data=dataset,
    kind="bar",
Example #8
def test_get_dummies():
    s = pd.Series(list("abca"))
    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s)

    s1 = ["a", "b", np.nan]
    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s1)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s1, dummy_na=True)

    data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    modin_result = pd.get_dummies(modin_df, prefix=["col1", "col2"])
    pandas_result = pandas.get_dummies(pandas_df, prefix=["col1", "col2"])
    df_equals(modin_result, pandas_result)
    assert modin_result._to_pandas().columns.equals(pandas_result.columns)
    assert modin_result.shape == pandas_result.shape

    modin_result = pd.get_dummies(pd.DataFrame(pd.Series(list("abcdeabac"))))
    pandas_result = pandas.get_dummies(
        pandas.DataFrame(pandas.Series(list("abcdeabac"))))
    df_equals(modin_result, pandas_result)
    assert modin_result._to_pandas().columns.equals(pandas_result.columns)
    assert modin_result.shape == pandas_result.shape

    with pytest.raises(NotImplementedError):
        pd.get_dummies(modin_df, prefix=["col1", "col2"], sparse=True)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abcaa")))

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abc")), dtype=float)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(1)
Example #9
def model(features, test_features, encoding="ohe", n_folds=5):
    test_ids = test_features["SK_ID_CURR"]
    labels = features["TARGET"]
    features = features.drop(columns=["SK_ID_CURR", "TARGET"])
    test_features = test_features.drop(columns=["SK_ID_CURR"])
    if encoding == "ohe":
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        features, test_features = features.align(test_features,
                                                 join="inner",
                                                 axis=1)
        cat_indices = "auto"
    elif encoding == "le":
        label_encoder = LabelEncoder()
        cat_indices = []
        for i, col in enumerate(features):
            if features[col].dtype == "object":
                features[col] = label_encoder.fit_transform(
                    np.array(features[col].astype(str)).reshape((-1, )))
                test_features[col] = label_encoder.transform(
                    np.array(test_features[col].astype(str)).reshape((-1, )))
                cat_indices.append(i)
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")
    print("Training Data Shape: ", features.shape)
    print("Testing Data Shape: ", test_features.shape)
    feature_names = list(features.columns)
    features = np.array(features)
    test_features = np.array(test_features)
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(features.shape[0])
    valid_scores = []
    train_scores = []
    for train_indices, valid_indices in k_fold.split(features):
        train_features, train_labels = features[train_indices], labels[
            train_indices]
        valid_features, valid_labels = features[valid_indices], labels[
            valid_indices]
        model = lgb.LGBMClassifier(
            n_estimators=10000,
            objective="binary",
            class_weight="balanced",
            learning_rate=0.05,
            reg_alpha=0.1,
            reg_lambda=0.1,
            subsample=0.8,
            n_jobs=-1,
            random_state=50,
        )
        model.fit(
            train_features,
            train_labels,
            eval_metric="auc",
            eval_set=[(valid_features, valid_labels),
                      (train_features, train_labels)],
            eval_names=["valid", "train"],
            categorical_feature=cat_indices,
            early_stopping_rounds=100,
            verbose=200,
        )
        best_iteration = model.best_iteration_
        feature_importance_values += model.feature_importances_ / k_fold.n_splits
        test_predictions += (model.predict_proba(
            test_features, num_iteration=best_iteration)[:, 1] /
                             k_fold.n_splits)
        out_of_fold[valid_indices] = model.predict_proba(
            valid_features, num_iteration=best_iteration)[:, 1]
        valid_score = model.best_score_["valid"]["auc"]
        train_score = model.best_score_["train"]["auc"]
        valid_scores.append(valid_score)
        train_scores.append(train_score)
        gc.enable()
        del model, train_features, valid_features
        gc.collect()
    submission = pd.DataFrame({
        "SK_ID_CURR": test_ids,
        "TARGET": test_predictions
    })
    feature_importances = pd.DataFrame({
        "feature": feature_names,
        "importance": feature_importance_values
    })
    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    fold_names = list(range(n_folds))
    fold_names.append("overall")
    metrics = pd.DataFrame({
        "fold": fold_names,
        "train": train_scores,
        "valid": valid_scores
    })
    return submission, feature_importances, metrics
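# A hedged usage sketch (not part of the original function): 'app_train' and 'app_test'
# stand for the Home Credit application DataFrames assumed to be loaded elsewhere, with
# 'SK_ID_CURR' and (for app_train) 'TARGET' columns present.
submission, feature_importances, metrics = model(app_train, app_test,
                                                 encoding="ohe", n_folds=5)
print(metrics)
submission.to_csv("baseline_lgb_submission.csv", index=False)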
Example #10
app_train.dtypes.value_counts()
app_train.select_dtypes("object").apply(pd.Series.nunique, axis=0)
le = LabelEncoder()
le_count = 0
for col in app_train:
    if app_train[col].dtype == "object":
        # If 2 or fewer unique categories
        if len(list(app_train[col].unique())) <= 2:
            # Train on the training data
            le.fit(app_train[col])
            # Transform both training and testing data
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            le_count += 1
print("%d columns were label encoded." % le_count)
app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)
print("Training Features shape: ", app_train.shape)
print("Testing Features shape: ", app_test.shape)
train_labels = app_train["TARGET"]
app_train, app_test = app_train.align(app_test, join="inner", axis=1)
app_train["TARGET"] = train_labels
print("Training Features shape: ", app_train.shape)
print("Testing Features shape: ", app_test.shape)
(app_train["DAYS_BIRTH"] / -365).describe()
app_train["DAYS_EMPLOYED"].describe()
app_train["DAYS_EMPLOYED"].plot.hist(title="Days Employment Histogram")
plt.xlabel("Days Employment")
# 365243 is the anomalous placeholder value in DAYS_EMPLOYED
anom = app_train[app_train["DAYS_EMPLOYED"] == 365243]
non_anom = app_train[app_train["DAYS_EMPLOYED"] != 365243]
print("The non-anomalies default on %0.2f%% of loans" %
Example #11
knn_cv = GridSearchCV(knn, grid, cv=3)  # GridSearchCV
knn_cv.fit(x, y)  # Fit
print("Tuned hyperparameter k: {}".format(knn_cv.best_params_))
print("Best score: {}".format(knn_cv.best_score_))
param_grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=12)
logreg = LogisticRegression()
logreg_cv = GridSearchCV(logreg, param_grid, cv=3)
logreg_cv.fit(x_train, y_train)
print("Tuned hyperparameters : {}".format(logreg_cv.best_params_))
print("Best Accuracy: {}".format(logreg_cv.best_score_))
data = pd.read_csv("column_2C_weka.csv")
df = pd.get_dummies(data)
df.head(10)
df.drop("class_Normal", axis=1, inplace=True)
df.head(10)
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

steps = [("scalar", StandardScaler()), ("SVM", SVC())]
pipeline = Pipeline(steps)
parameters = {"SVM__C": [1, 10, 100], "SVM__gamma": [0.1, 0.01]}
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)
cv = GridSearchCV(pipeline, param_grid=parameters, cv=3)
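# The original snippet is cut off above, so the lines below are only an assumed
# continuation showing the usual next steps, not the author's code.
cv.fit(x_train, y_train)
print("Tuned SVM parameters: {}".format(cv.best_params_))
print("Best score: {}".format(cv.best_score_))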
Example #12
    def test_modin(self):

        df = md.DataFrame([[1, 2., True], [2, 3., False]],
                          columns=['a', 'b', 'c'])
        dm = xgb.DMatrix(df, label=md.Series([1, 2]))
        assert dm.feature_names == ['a', 'b', 'c']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
        np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

        # overwrite feature_names and feature_types
        dm = xgb.DMatrix(df,
                         label=md.Series([1, 2]),
                         feature_names=['x', 'y', 'z'],
                         feature_types=['q', 'q', 'q'])
        assert dm.feature_names == ['x', 'y', 'z']
        assert dm.feature_types == ['q', 'q', 'q']
        assert dm.num_row() == 2
        assert dm.num_col() == 3

        # incorrect dtypes
        df = md.DataFrame([[1, 2., 'x'], [2, 3., 'y']],
                          columns=['a', 'b', 'c'])
        self.assertRaises(ValueError, xgb.DMatrix, df)

        # numeric columns
        df = md.DataFrame([[1, 2., True], [2, 3., False]])
        dm = xgb.DMatrix(df, label=md.Series([1, 2]))
        assert dm.feature_names == ['0', '1', '2']
        assert dm.feature_types == ['int', 'float', 'i']
        assert dm.num_row() == 2
        assert dm.num_col() == 3
        np.testing.assert_array_equal(dm.get_label(), np.array([1, 2]))

        df = md.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6])
        dm = xgb.DMatrix(df, label=md.Series([1, 2]))
        assert dm.feature_names == ['4', '5', '6']
        assert dm.feature_types == ['int', 'float', 'int']
        assert dm.num_row() == 2
        assert dm.num_col() == 3

        df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
        dummies = md.get_dummies(df)
        #    B  A_X  A_Y  A_Z
        # 0  1    1    0    0
        # 1  2    0    1    0
        # 2  3    0    0    1
        result, _, _ = xgb.data._transform_pandas_df(dummies)
        exp = np.array([[1., 1., 0., 0.], [2., 0., 1., 0.], [3., 0., 0., 1.]])
        np.testing.assert_array_equal(result, exp)
        dm = xgb.DMatrix(dummies)
        assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z']
        assert dm.feature_types == ['int', 'int', 'int', 'int']
        assert dm.num_row() == 3
        assert dm.num_col() == 4

        df = md.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]})
        dm = xgb.DMatrix(df)
        assert dm.feature_names == ['A=1', 'A=2']
        assert dm.feature_types == ['int', 'int']
        assert dm.num_row() == 3
        assert dm.num_col() == 2

        df_int = md.DataFrame([[1, 1.1], [2, 2.2]], columns=[9, 10])
        dm_int = xgb.DMatrix(df_int)
        df_range = md.DataFrame([[1, 1.1], [2, 2.2]], columns=range(9, 11, 1))
        dm_range = xgb.DMatrix(df_range)
        assert dm_int.feature_names == ['9', '10']  # assert not "9 "
        assert dm_int.feature_names == dm_range.feature_names

        # test MultiIndex as columns
        df = md.DataFrame([(1, 2, 3, 4, 5, 6), (6, 5, 4, 3, 2, 1)],
                          columns=md.MultiIndex.from_tuples((
                              ('a', 1),
                              ('a', 2),
                              ('a', 3),
                              ('b', 1),
                              ('b', 2),
                              ('b', 3),
                          )))
        dm = xgb.DMatrix(df)
        assert dm.feature_names == ['a 1', 'a 2', 'a 3', 'b 1', 'b 2', 'b 3']
        assert dm.feature_types == ['int', 'int', 'int', 'int', 'int', 'int']
        assert dm.num_row() == 2
        assert dm.num_col() == 6
Example #13
df34 = df4[top10HCA]
df44 = df4[top10HD]

# Let's visualize those datasets to ensure we are good to move forward.
df34.info()
df44.info()

# Now let's look at the medians and means of each feature per cluster to get an idea of what the clusters may mean. We can then try a logistic regression and a multinomial logistic regression to see whether they add anything we need or want to know.
df_medianHCA = (df34.loc[df34.hca >= 0, :].groupby(df34.hca).median())
df_meanHCA = (df34.loc[df34.hca >= 0, :].groupby(df34.hca).mean())
pd.set_option('display.max_columns', None)
print(df_medianHCA)
print(df_meanHCA)

# That gives us a pretty good idea of what our clusters may mean, but let's see if we can get any insight from a correlation matrix. Let's split the clusters up to get a detailed look at each of them separately.
df32 = pd.get_dummies(df34['hca'], prefix = "hca")
df32.info()
df72 = pd.concat([df32, df34], axis=1)
df72.info()
df82 = df72.drop('hca', axis=1)
df82.info()

# Let's take a look at a correlation matrix to help us out here, since we did not have much luck with the logits for HCA.
hcacorr = df82.corr()
print(hcacorr)
fig, ax = plt.subplots(figsize=(50,50))
sns.heatmap(df82.corr(), annot=True)
ax.tick_params(axis='both', which='major', labelsize=20)
ax.tick_params(axis='both', which='minor', labelsize=20)

# We can see that the logit excluded the first cluster as a reference cluster for the others and did not include its results in the model. For that reason we will base our inferences on the existing cluster coefficients and, further, on the per-feature medians we computed above for each cluster, to get a fuller picture of what everything means.
Example #14
missing_value_df5

# Since our data is real, and what is missing is personal data that was not collected, we do not want to impute or replace values, given that doing so may skew the results and harm the integrity of the data. We will instead drop all of the patients with missing values.
df6 = df5.dropna()
df6.info()

# Now let's look at the distribution of the target feature to see if we may need to do anything with the data.
sns.distplot(df6['BLDPGVN'])

# It does look like we have an imbalanced target, which will dictate the evaluation metric and models we use: AUC and the F1 score are appropriate for this classification problem. We are going to use models that can handle mixed data, and we will one-hot encode to get a more accurate look at the predictions. Next we will one-hot encode a few features (code level, EDMAP, EDDBP, EDSBP, EDHR, and GCS) to get more understanding of their values, and we will label encode the cause codes and the transport agencies.

# Let's create the list of columns we will need to one-hot encode.
df66 = ['CODE_LEVEL', 'ED_MAP1', 'ED_DBP2', 'ED_SBP2', 'ED_HR2', 'ED_GCS1', 'CAUSE_CODE']

# Onehot encode first.
df7 = pd.get_dummies(df6, prefix_sep='_', drop_first=False, columns = df66)
df7.info()

# Label encode next.
le = LabelEncoder()
df7['TRANS_AGENCY'] = le.fit_transform(df7['TRANS_AGENCY'])
df7['TRANS_AGENCY'].head(10)

# Now that we have processed our labels and made everything numeric, we can proceed with creating our train and test sets and modeling. We will be using pafit for our parameter tuning. Because we have a time element, we will sort on it and split the data on that time element for prediction, creating a train-validation-test set so that we can effectively predict future cases (a sketch of that split follows below). We will use an RF, an XGBoost, and a Logistic Regression model to look for the best prediction model. Then we will look at our important features and fit a statistical logistic regression to better understand the relationship between those important features and the target, which is whether or not blood was given. We want to predict this to better prepare for the times when we will need blood and to better anticipate the need for blood products, which will help the blood bank know when we may need to start calling for more blood early.
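# A hedged sketch of the time-ordered split described above. 'ARRIVAL_DATE' and the
# 60/20/20 proportions are assumptions for illustration, not taken from the original
# analysis; 'BLDPGVN' is the target named earlier.
df_sorted = df7.sort_values('ARRIVAL_DATE')
n_rows = len(df_sorted)
train_part = df_sorted.iloc[:int(0.6 * n_rows)]
valid_part = df_sorted.iloc[int(0.6 * n_rows):int(0.8 * n_rows)]
test_part = df_sorted.iloc[int(0.8 * n_rows):]
feature_cols = [c for c in df_sorted.columns if c not in ('BLDPGVN', 'ARRIVAL_DATE')]
X_train, y_train = train_part[feature_cols], train_part['BLDPGVN']
X_valid, y_valid = valid_part[feature_cols], valid_part['BLDPGVN']
X_test, y_test = test_part[feature_cols], test_part['BLDPGVN']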

df7.head(10)

# Let's look at some general and descriptive stats.

# Let's look at what the length of stay may be for a patient given blood. We can also look at the total ICU days for patients given blood and the hospital arrival dates for those given blood, to see things over time.
plt.figure(figsize=(6, 12))