list(le.classes_) ) #['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', 'More than 100 Days'] #print(list(le.inverse_transform([0, 1, 2,3, 4, 5, 6, 7, 8, 9, 10]))) ## Detect and Handle Outliers columns = [ 'Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'Visitors with Patient', 'Admission_Deposit' ] # for col in columns: # to show outliers # sns.boxplot(x=col, data=df) # sns.stripplot(x=col, data=df, color="#474646") # plt.show() from datasist.structdata import detect_outliers outliers_indices = detect_outliers(df, 0, columns) print(len(outliers_indices)) # handle outliers df.drop(outliers_indices, inplace=True) df.info() ### Deal with Imbalanced classes ## Stay column print(df['Stay'].value_counts()) from sklearn.model_selection import train_test_split x = df.drop('Stay', axis=1) y = df['Stay'] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=22) from imblearn.over_sampling import SMOTE
elif x in [6, 7, 8]: return 'Summer' elif x in [9, 10, 11]: return 'Autumn' df['Season'] = df['Month'].apply(map_months) print(df.head(10)) ### Visualization sns.pairplot(df, vars=['temperature', 'humidity', 'windspeed', 'count', 'Season']) plt.show() ################# Detect and Handle Outliers columns =['temperature', 'humidity', 'windspeed'] from datasist.structdata import detect_outliers outliers = detect_outliers(df, 0, columns) print(len(outliers)) # 10 number of rows which contain outliers # delet outliers df.drop(outliers, inplace=True) ## Deal with Categorical Data (Season , month) df = pd.get_dummies(df, columns=['Season'], drop_first=True) print(df.head(10)) ############# Feature Scaling # # #####To split Data to train && test when using all data set linear or non linear relation # # ### when select features # # from sklearn.model_selection import train_test_split print(df['Year']) x = df.drop(['count','date','Month', 'Day','Season_Summer'], axis=1) y = df['count'] x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=90)