def process_data(trainPath=".", testPath="."):
    """Load the training tables, preprocess them and return a train/test split.

    Reads ``subs_csi_train.csv``, ``subs_features_train.csv`` and
    ``subs_bs_consumption_train.csv`` from ``trainPath``, joins them on
    ``SK_ID``, keeps only rows with ``ACT == 1`` and builds a scikit-learn
    preprocessing pipeline: imputation for binary columns; imputation,
    robust scaling, quantile transform and polynomial features for numeric
    columns; constant imputation plus an int64 cast for categorical columns.

    The pipeline is fitted on the training split only. The held-out split is
    transformed with the already-fitted pipeline, so both splits share the
    same fitted parameters and no test-row statistics leak into
    preprocessing. The same fitted pipeline is then handed to
    ``process_submission_data``.

    Args:
        trainPath: Directory containing the three ``*_train.csv`` files.
        testPath: Directory forwarded to ``process_submission_data``
            (presumably holds the submission CSVs; confirm against that
            helper).

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, X_submission_df,
        X_submission)``. ``X_train`` / ``X_test`` are the pipeline outputs,
        ``y_train`` / ``y_test`` are the matching ``CSI`` targets, and the
        last two items are whatever ``process_submission_data`` returns.
    """
    subs_csi = pd.read_csv(
        os.path.join(trainPath, 'subs_csi_train.csv'), index_col='SK_ID')
    subs_features = pd.read_csv(
        os.path.join(trainPath, 'subs_features_train.csv'), index_col='SK_ID')
    subs_bs_consumption = pd.read_csv(
        os.path.join(trainPath, 'subs_bs_consumption_train.csv'))

    df = subs_csi.merge(subs_features, on="SK_ID")
    # Sum the consumption rows per (SK_ID, MON) before joining on SK_ID.
    consumption_totals = subs_bs_consumption.groupby(
        by=["SK_ID", "MON"], as_index=False).sum().set_index('SK_ID')
    df = df.merge(consumption_totals, on='SK_ID')
    df = sortByDate(df)
    df = df[df['ACT'] == 1]

    X = df.drop(columns=[
        "CSI", "SNAP_DATE", "CONTACT_DATE", 'COM_CAT#24', 'ACT', 'MON',
        'CELL_LAC_ID'
    ])
    y = df["CSI"]

    profileReport = ProfileReport(df)

    categorical_features = ['ARPU_GROUP', 'DEVICE_TYPE_ID', 'INTERNET_TYPE_ID']
    # "CSI" is not a column of X; the intersection with X below removes it.
    binary_features = ['BASE_TYPE', 'COM_CAT#25', 'COM_CAT#26', "CSI"]
    numerical_features = list(
        set(X.columns) - set(categorical_features) - set(binary_features))

    # Cast to int first (best effort) so category labels are integers where
    # possible, then convert to pandas' categorical dtype.
    X[categorical_features] = X[categorical_features].astype(
        'int', errors='ignore')
    X[categorical_features] = X[categorical_features].astype(
        'category', errors='ignore')

    # Drop the variables the profiling report rejected and keep the feature
    # lists in sync with the columns that remain.
    X = X.drop(columns=profileReport.get_rejected_variables())
    categorical_features = list(set(X).intersection(categorical_features))
    numerical_features = list(set(X).intersection(numerical_features))
    binary_features = list(set(X).intersection(binary_features))

    classifier_pipeline = Pipeline(steps=[
        (
            'feature_processing',
            ColumnTransformer(transformers=[
                # binary
                ('binary',
                 Pipeline([
                     ('impute',
                      SimpleImputer(missing_values=np.nan,
                                    strategy='most_frequent')),
                 ]),
                 binary_features),
                # numeric
                ('numeric',
                 Pipeline([
                     ('impute',
                      SimpleImputer(missing_values=np.nan, strategy='mean')),
                     ('scale', RobustScaler()),
                     ('transform',
                      QuantileTransformer(output_distribution='normal')),
                     ('engineer', PolynomialFeatures()),
                 ]),
                 numerical_features),
                # categorical
                ('categorical',
                 Pipeline([
                     ('impute',
                      SimpleImputer(missing_values=np.nan,
                                    strategy='constant',
                                    fill_value=-10000)),
                     ('toint',
                      FunctionTransformer(lambda x: x.astype('int64'))),
                 ]),
                 categorical_features),
            ])),
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    X_train = classifier_pipeline.fit_transform(X_train)
    # Only transform the held-out split: re-fitting here would learn
    # imputation/scaling parameters from the test rows and leave the pipeline
    # fitted on test data for the submission step below.
    X_test = classifier_pipeline.transform(X_test)

    X_submission_df, X_submission = process_submission_data(
        classifier_pipeline, testPath)
    return X_train, X_test, y_train, y_test, X_submission_df, X_submission
# In[13]:
# Difference between the column counts of X and train_removed_all_once.
len(X.columns) - len(train_removed_all_once.columns)

# In[72]:
# Continue working with train_removed_all_once under the name X.
X = train_removed_all_once

# In[73]:
profileReport.get_rejected_variables()

# In[87]:
X = X.drop(['COM_CAT#22', 'COM_CAT#23', 'COM_CAT#28'], axis='columns')

# In[88]:
# Restrict each feature list to the columns still present in X.
categorical_features, numerical_features, binary_features = (
    list(set(X).intersection(feature_group))
    for feature_group in (
        categorical_features, numerical_features, binary_features)
)
# np.where with a single argument returns a tuple of index arrays.
categorical_indices = np.where(X.dtypes == 'category')