Example #1
0
def load_train_test(nrows=None, silent=True, treat_cat_missing=False, treat_num_missing=False, remove_duplicated_cols=False):
    """Load the train/test sets and add date-derived and outlier-encoded features.

    Parameters
    ----------
    nrows : int or None
        Row limit forwarded to ``pp.read_train_test`` (None reads everything).
    silent, treat_cat_missing, treat_num_missing, remove_duplicated_cols :
        Accepted for interface compatibility; not used by this loader.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Frames with date parts, elapsed time, per-feature outlier-rate
        encodings (``feature_N_``) and one-hot encoded feature_1..3.
    """
    train, test = pp.read_train_test(nrows=nrows)

    # Impute missing first_active_month in test with the most frequent value.
    # Plain assignment instead of chained fillna(inplace=True): the chained
    # form does not reliably write back under pandas copy-on-write (2.x/3.0).
    test["first_active_month"] = test["first_active_month"].fillna(
        test["first_active_month"].mode().iloc[0])

    # Hoist "today" so both frames use the identical reference instant.
    today = datetime.datetime.today()
    for df in (train, test):
        df['first_active_month'] = pd.to_datetime(df['first_active_month'],
                                                  format='%Y-%m-%d')
        df['elapsed_time'] = (today - df['first_active_month']).dt.days
        df["year"] = df["first_active_month"].dt.year
        df["month"] = df["first_active_month"].dt.month
        df['dayofweek'] = df['first_active_month'].dt.dayofweek
        # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
        # is the supported replacement (cast back from UInt32 to int64 to
        # keep the original dtype).
        df['weekofyear'] = df['first_active_month'].dt.isocalendar().week.astype('int64')

    # Mark extreme targets (< -30) as outliers, then mean-encode each raw
    # feature by its outlier rate so the encoding is shared by train and test.
    train['outliers'] = 0
    train.loc[train['target'] < -30, 'outliers'] = 1

    for f in ['feature_1', 'feature_2', 'feature_3']:
        order_label = train.groupby([f])['outliers'].mean()
        train[f + "_"] = train[f].map(order_label)
        test[f + "_"] = test[f].map(order_label)

    # The helper column is only needed to build the encodings above.
    train.drop(['outliers'], axis=1, inplace=True)

    train = pp.hot_encode(train, ["feature_1", "feature_2", "feature_3"])
    test = pp.hot_encode(test, ["feature_1", "feature_2", "feature_3"])

    return train, test
Example #2
0
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR, LinearSVR, LinearSVC
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, Ridge, SGDRegressor, LassoLars
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor

import warnings


def ignore_warn(*args, **kwargs):
    """No-op drop-in for warnings.warn: silently discard every argument."""
    return None


# Silence noisy libraries by monkey-patching warnings.warn globally.
warnings.warn = ignore_warn  #ignore annoying warning (from sklearn and seaborn)

# Load the raw train/test sets through the project's preprocessing module.
train, test = pp.read_train_test()

# Remove outlier rows from train (criterion defined inside pp.drop_outliers).
train = pp.drop_outliers(train)

# Stack train (minus the target) on top of test so every transformation
# below is applied identically to both sets.
all_data = pp.concat_train_test(train.drop(['SalePrice'], axis=1), test)

#ds = ds.drop(['Utilities'], axis=1)
#ds = ds.drop(high_occurance_missing(ds, 0.8), axis=1)

# Numeric codes that are really categories -> categorical representation.
all_data = pp.convert_numeric2category(all_data)
# Handle missing values; presumably returns the columns that had NaNs -- verify.
was_missing_columns = pp.handle_missing(all_data)

# Encoding plus feature engineering; the *_columns results record which
# columns each step touched (exact contents depend on the pp helpers).
all_data = pp.encode(all_data)
shrunk_columns = pp.shrink_scales(all_data)
engineered_columns = pp.add_engineered_features(all_data)
simplified_columns = pp.simplify_features(all_data)
Example #3
0
        'colsample_bytree': 0.054,
        'colsample_bylevel': 0.50,
        'n_jobs': -1,
        'random_state': 456
    }
    
# Keyword arguments forwarded to the model's fit(): stop after 15 rounds
# without RMSE improvement on the eval set, and suppress per-round logging.
fit_params = dict(
    early_stopping_rounds=15,
    eval_metric='rmse',
    verbose=False,
)

import matplotlib.pyplot as plt
# Sparsity plot of the training matrix.
# NOTE(review): 'train' is used here but only (re)assigned two lines below --
# presumably it exists from an earlier run/cell; verify the ordering.
plt.spy(train)

train, test = pp.read_train_test(train_file = 'train.csv', test_file = 'test.csv')

#train_sparse = train.replace(0, np.nan).to_sparse()
#test_sparse = test.replace(0, np.nan).to_sparse()

#train_X_sparse = train_sparse.drop(['ID','target'], axis=1)
#train_y_sparse = (np.log1p(train_sparse.target)).values

# Keep the test IDs for building the submission file later.
ids = list(test.ID)

# Feature matrix / log1p-transformed target for training.
train_X = train.drop(['ID','target'], axis=1)
train_y = (np.log1p(train.target)).values

test_X = test.drop(['ID'], axis=1)
pipe = Pipeline(
Example #4
0
def load_train_test(nrows=None,
                    silent=True,
                    treat_cat_missing=False,
                    treat_num_missing=False,
                    remove_duplicated_cols=False):
    """Load and preprocess the application train/test sets.

    Pipeline: read both CSVs, drop train rows whose category values never
    occur in test, merge rare category values (<1% of rows) into a shared
    "Other 2" bucket, null-out anomalous numeric values, label-encode
    binary object columns, optionally impute missing values, one-hot
    encode, align train/test columns, and add engineered features.

    Parameters
    ----------
    nrows : int or None
        Row limit forwarded to pp.read_train_test (None reads all rows).
    silent : bool
        When False, print progress messages.
    treat_cat_missing : bool
        Impute NAME_TYPE_SUITE with "Unaccompanied" and create dummy-NA
        indicator columns during one-hot encoding.
    treat_num_missing : bool
        Median-impute numeric columns via pp.HandleMissingMedianTransformer.
    remove_duplicated_cols : bool
        Detect duplicated columns on train and drop them from both sets.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Column-aligned frames; train additionally carries 'TARGET'.
    """
    train, test = pp.read_train_test(train_file='application_train.csv',
                                     test_file='application_test.csv',
                                     nrows=nrows)

    # Remove some rows with values not present in test set
    train = train[train['CODE_GENDER'] != 'XNA']
    train = train[train['NAME_INCOME_TYPE'] != 'Maternity leave']
    train = train[train['NAME_FAMILY_STATUS'] != 'Unknown']

    if not silent:
        print("Train samples: {}, Test samples: {}".format(
            len(train), len(test)))

    # Decrease number of categories in ORGANIZATION_TYPE
    # (the loop below actually scans every categorical column, not only
    # ORGANIZATION_TYPE).
    _, cat_cols_train = pp.get_feature_groups(train)
    _, cat_cols_test = pp.get_feature_groups(test)

    if not silent:
        print("Decreading the number of categories...")

    # For each categorical column, collect values covering <1% of rows in
    # either set and collapse their union into one "Other 2" bucket so
    # train and test end up with the same category vocabulary.
    for col in cat_cols_train:
        cat_values_table_train = pp.check_categorical_cols_values(train,
                                                                  col=col)
        s_low_values_train = set(cat_values_table_train[
            cat_values_table_train.loc[:, "% of Total"] < 1].index)

        cat_values_table_test = pp.check_categorical_cols_values(test, col=col)
        s_low_values_test = set(cat_values_table_test[
            cat_values_table_test.loc[:, "% of Total"] < 1].index)

        l_union = list(s_low_values_train.union(s_low_values_test))

        # Only merge when at least two rare values would be combined.
        if len(l_union) >= 2:
            if not silent:
                print(
                    "Decreasing the number of categories in {}...".format(col))
                print("The following categories will be grouped: {}".format(
                    l_union))
            train.loc[train[col].isin(l_union), col] = "Other 2"
            test.loc[test[col].isin(l_union), col] = "Other 2"

    # Treat the application hour as categorical (object dtype) so it is
    # one-hot encoded later instead of being used as a number.
    train.loc[:,
              'HOUR_APPR_PROCESS_START'] = train.loc[:,
                                                     'HOUR_APPR_PROCESS_START'].astype(
                                                         'object')
    test.loc[:,
             'HOUR_APPR_PROCESS_START'] = test.loc[:,
                                                   'HOUR_APPR_PROCESS_START'].astype(
                                                       'object')

    # Project-defined anomaly handling for DAYS_EMPLOYED.
    train = treat_anomalies(train, columns=['DAYS_EMPLOYED'])
    test = treat_anomalies(test, columns=['DAYS_EMPLOYED'])

    # Null-out implausible values; identical thresholds for both sets.
    train.loc[train['OWN_CAR_AGE'] > 80, 'OWN_CAR_AGE'] = np.nan
    train.loc[train['REGION_RATING_CLIENT_W_CITY'] < 0,
              'REGION_RATING_CLIENT_W_CITY'] = np.nan
    train.loc[train['AMT_INCOME_TOTAL'] > 1e8, 'AMT_INCOME_TOTAL'] = np.nan
    train.loc[train['AMT_REQ_CREDIT_BUREAU_QRT'] > 10,
              'AMT_REQ_CREDIT_BUREAU_QRT'] = np.nan
    train.loc[train['OBS_30_CNT_SOCIAL_CIRCLE'] > 40,
              'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan

    test.loc[test['OWN_CAR_AGE'] > 80, 'OWN_CAR_AGE'] = np.nan
    test.loc[test['REGION_RATING_CLIENT_W_CITY'] < 0,
             'REGION_RATING_CLIENT_W_CITY'] = np.nan
    test.loc[test['AMT_INCOME_TOTAL'] > 1e8, 'AMT_INCOME_TOTAL'] = np.nan
    test.loc[test['AMT_REQ_CREDIT_BUREAU_QRT'] > 10,
             'AMT_REQ_CREDIT_BUREAU_QRT'] = np.nan
    test.loc[test['OBS_30_CNT_SOCIAL_CIRCLE'] > 40,
             'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan

    # Per-row count of missing values, computed before any imputation.
    train['COUNT_MISSING'] = train.isnull().sum(axis=1).values
    test['COUNT_MISSING'] = test.isnull().sum(axis=1).values

    # Label-encode object columns with at most two distinct values
    # (NaN counted); wider categoricals go through get_dummies below.
    cat_cols = pp.get_dtype_columns(train, [np.dtype(object)])
    cat_cols2encode = [
        c for c in cat_cols if len(train[c].value_counts(dropna=False)) <= 2
    ]

    if not silent:
        print("Label encoding {}".format(cat_cols2encode))

    le = LabelEncoder()
    for col in cat_cols2encode:
        le.fit(train[col])
        train[col] = le.transform(train[col])
        # NOTE(review): transform raises ValueError if test holds a label
        # unseen in train -- presumed safe for these binary columns; verify.
        test[col] = le.transform(test[col])

    # CATEGORICAL MISSING
    #print(pp.check_missing(train[pp.get_categorical_missing_cols(train)]))
    #print(pp.check_missing(test[pp.get_categorical_missing_cols(test)]))
    if (treat_cat_missing):
        if not silent:
            print("Treating categoricals missing...")
        train.NAME_TYPE_SUITE.fillna("Unaccompanied", inplace=True)
        test.NAME_TYPE_SUITE.fillna("Unaccompanied", inplace=True)

    # High density missing categorical columns - deserves a column when performing get_dummies
    # FONDKAPREMONT_MODE, WALLSMATERIAL_MODE, HOUSETYPE_MODE, EMERGENCYSTATE_MODE, OCCUPATION_TYPE

    if not silent:
        print("Creating dummies variables...")
    train = pd.get_dummies(train, dummy_na=treat_cat_missing, dtype='bool')
    test = pd.get_dummies(test, dummy_na=treat_cat_missing, dtype='bool')

    # Inner-align drops dummy columns present in only one set; TARGET would
    # be dropped by the join, so save and restore it around the align.
    train_labels = train['TARGET']
    train, test = train.align(test, join='inner', axis=1)
    train['TARGET'] = train_labels

    # NUMERICAL MISSING
    #print(pp.check_missing(train[pp.get_numerical_missing_cols(train)]))
    #print(pp.check_missing(test[pp.get_numerical_missing_cols(test)]))
    if (treat_num_missing):
        if not silent:
            print("Treating numericals missing...")
        num_missing_trans = pp.HandleMissingMedianTransformer()
        train = num_missing_trans.fit_transform(train)
        # NOTE(review): fit_transform is called again on test, so test is
        # imputed with its own medians rather than train's -- confirm that
        # this is intended.
        test = num_missing_trans.fit_transform(test)

    # FEATURE ENGINEERING
    if not silent:
        print("Feature engineering...")
    train = pp.get_domain_knowledge_features(train)
    test = pp.get_domain_knowledge_features(test)

    # Duplicates are detected on train only; the same columns are dropped
    # from test to keep the frames aligned.
    if remove_duplicated_cols:
        duplicated_train = pp.duplicate_columns(train,
                                                verbose=not silent,
                                                progress=False)
        if not silent:
            print("Removing duplicated columns {}".format(duplicated_train))
        train.drop(list(duplicated_train.keys()), axis=1, inplace=True)
        test.drop(list(duplicated_train.keys()), axis=1, inplace=True)

    return train, test
Example #5
0
import preprocessing as pp
from sklearn.preprocessing import Binarizer, LabelEncoder


def make_submission(model,
                    X_train,
                    y_train,
                    X_test,
                    filename='submission.csv',
                    ids=None):
    """Fit *model* and write a two-column Kaggle submission CSV.

    Bug fix: the original predicted on the module-level globals ``test_X``
    and ``ids`` instead of its own ``X_test`` parameter, so the parameter
    was silently ignored (and the call failed with NameError when the
    globals were absent).

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    X_train, y_train :
        Training features and labels.
    X_test : pandas.DataFrame
        Features to score.
    filename : str
        Output CSV path (default ``'submission.csv'``).
    ids : sequence or None
        Values for the SK_ID_CURR column; defaults to ``X_test.index``.

    Returns
    -------
    None; writes *filename* with columns SK_ID_CURR and TARGET
    (probability of the positive class).
    """
    model.fit(X_train, y_train)
    # Column 1 of predict_proba = probability of the positive class.
    predicted = model.predict_proba(X_test)[:, 1]
    if ids is None:
        ids = X_test.index
    my_submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': predicted})
    my_submission.to_csv(filename, index=False)


# Load the application train/test sets through the preprocessing module.
train, test = pp.read_train_test(train_file='application_train.csv',
                                 test_file='application_test.csv')

# Object-dtype columns with at most three distinct values (NaN counted)
# are label-encoded here; wider categoricals are handled elsewhere.
cat_cols = pp.get_dtype_columns(train, [np.dtype(object)])
cat_cols2encode = [
    c for c in cat_cols if len(train[c].value_counts(dropna=False)) <= 3
]

le = LabelEncoder()
for col in cat_cols2encode:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    # NOTE(review): raises ValueError if test contains a label unseen in
    # train -- verify these low-cardinality columns share their values.
    test[col] = le.transform(test[col])

# CATEGORICAL MISSING
# Report categorical columns that still contain missing values.
print(pp.check_missing(train[pp.get_categorical_missing_cols(train)]))
print(pp.check_missing(test[pp.get_categorical_missing_cols(test)]))