예제 #1
0
def process_data(trainPath=".", testPath="."):
    """Load the training tables, preprocess them and return train/test splits.

    Reads ``subs_csi_train.csv``, ``subs_features_train.csv`` and
    ``subs_bs_consumption_train.csv`` from ``trainPath``, joins them on
    ``SK_ID``, keeps only rows with ``ACT == 1`` and builds a preprocessing
    pipeline (imputation, scaling, polynomial features) that is fitted on
    the training split only.

    Args:
        trainPath: Directory containing the three training CSV files.
        testPath: Passed through unchanged to ``process_submission_data``
            (defined elsewhere in the project).

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, X_submission_df,
        X_submission)``. ``X_train``/``X_test`` are the pipeline outputs,
        ``y_train``/``y_test`` are the matching ``CSI`` targets and the last
        two items are whatever ``process_submission_data`` returns.
    """
    subs_csi = pd.read_csv(os.path.join(trainPath, 'subs_csi_train.csv'),
                           index_col='SK_ID')
    subs_features = pd.read_csv(os.path.join(trainPath,
                                             'subs_features_train.csv'),
                                index_col='SK_ID')
    subs_bs_consumption = pd.read_csv(
        os.path.join(trainPath, 'subs_bs_consumption_train.csv'))

    df = subs_csi.merge(subs_features, on="SK_ID")
    # Consumption rows are summed per (SK_ID, MON) before being joined in.
    df = df.merge(subs_bs_consumption.groupby(
        by=["SK_ID", "MON"], as_index=False).sum().set_index('SK_ID'),
                  on='SK_ID')
    df = sortByDate(df)
    df = df[df['ACT'] == 1]
    X, y = df.drop(columns=[
        "CSI", "SNAP_DATE", "CONTACT_DATE", 'COM_CAT#24', 'ACT', 'MON',
        'CELL_LAC_ID'
    ]), df["CSI"]

    # The report is computed on the full frame, including the target and the
    # columns that were just removed from X.
    profileReport = ProfileReport(df)
    categorical_features = ['ARPU_GROUP', 'DEVICE_TYPE_ID', 'INTERNET_TYPE_ID']
    # "CSI" is the target and is never present in X; it is filtered out below.
    binary_features = ['BASE_TYPE', 'COM_CAT#25', 'COM_CAT#26', "CSI"]

    # Cast to int first (left untouched when the cast fails, e.g. on NaN) so
    # that the category labels are integers where possible.
    X[categorical_features] = X[categorical_features].astype('int',
                                                             errors='ignore')
    X[categorical_features] = X[categorical_features].astype('category',
                                                             errors='ignore')

    # Rejected variables may name columns that were already dropped from X
    # (the report saw the whole of df), so missing labels must not raise.
    X = X.drop(columns=profileReport.get_rejected_variables(),
               errors='ignore')

    # Build the feature groups in DataFrame column order so the layout of the
    # transformed matrix is reproducible from run to run.
    categorical_set = set(categorical_features)
    binary_set = set(binary_features)
    categorical_features = [c for c in X.columns if c in categorical_set]
    binary_features = [c for c in X.columns if c in binary_set]
    numerical_features = [
        c for c in X.columns
        if c not in categorical_set and c not in binary_set
    ]

    binary_steps = Pipeline([
        ('impute',
         SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ])
    numeric_steps = Pipeline([
        ('impute', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('scale', RobustScaler()),
        ('transform', QuantileTransformer(output_distribution='normal')),
        ('engineer', PolynomialFeatures()),
    ])
    categorical_steps = Pipeline([
        ('impute',
         SimpleImputer(missing_values=np.nan,
                       strategy='constant',
                       fill_value=-10000)),
        ('toint', FunctionTransformer(lambda x: x.astype('int64'))),
    ])

    classifier_pipeline = Pipeline(steps=[
        ('feature_processing',
         ColumnTransformer(transformers=[
             ('binary', binary_steps, binary_features),
             ('numeric', numeric_steps, numerical_features),
             ('categorical', categorical_steps, categorical_features),
         ])),
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    X_train = classifier_pipeline.fit_transform(X_train)
    # Only transform the held-out split: re-fitting here would leak test
    # statistics and leave the pipeline fitted on the test data instead of
    # the training data when it is reused for the submission set.
    X_test = classifier_pipeline.transform(X_test)
    X_submission_df, X_submission = process_submission_data(
        classifier_pipeline, testPath)
    return X_train, X_test, y_train, y_test, X_submission_df, X_submission
예제 #2
0
# Exported notebook cells. They rely on names created in earlier cells that
# are not shown here (X, train_removed_all_once, profileReport and the three
# *_features lists), so they only make sense when run in that order.

# In[13]:


# Displays how many more columns X has than train_removed_all_once.
# The value is not stored anywhere.
len(X.columns) - len(train_removed_all_once.columns)


# In[72]:


# From here on X refers to the reduced frame.
X = train_removed_all_once


# In[73]:


# Displays the variables the profiling report marks as rejected.
profileReport.get_rejected_variables()


# In[87]:


# NOTE(review): these three names look like the rejected variables shown by
# the previous cell, hard-coded by hand - confirm against the report output.
X = X.drop(columns=['COM_CAT#22','COM_CAT#23', 'COM_CAT#28'])


# In[88]:


# Restrict each feature list to the columns that are still present in X.
# Going through a set means the order of the resulting lists is not
# guaranteed to follow the column order of X.
categorical_features = list(set(X).intersection(categorical_features))
numerical_features = list(set(X).intersection(numerical_features))
binary_features = list(set(X).intersection(binary_features))
# np.where called with only a condition returns a tuple of index arrays, so
# the positions of the 'category' columns are in categorical_indices[0].
categorical_indices = np.where(X.dtypes == 'category')