示例#1
0
    list(le.classes_)
)  #['0-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71-80', '81-90', '91-100', 'More than 100 Days']
#print(list(le.inverse_transform([0, 1, 2,3, 4, 5, 6, 7, 8, 9, 10])))

## Detect and handle outliers
# Columns that are screened for outliers.
columns = [
    'Hospital_code',
    'City_Code_Hospital',
    'Available Extra Rooms in Hospital',
    'Bed Grade',
    'Visitors with Patient',
    'Admission_Deposit',
]
# Optional visual check, kept disabled: a box plot plus a strip plot per column.
# for col in columns:
#     sns.boxplot(x=col, data=df)
#     sns.stripplot(x=col, data=df, color="#474646")
#     plt.show()

from datasist.structdata import detect_outliers

# Ask datasist for the row labels it flags in the listed columns.
# NOTE(review): the second argument (0) is presumably the number of outlying
# values a row may have before it is flagged -- confirm against datasist docs.
outliers_indices = detect_outliers(df, 0, columns)
print(len(outliers_indices))

# Remove the flagged rows from df in place, then print the frame summary.
df.drop(index=outliers_indices, inplace=True)
df.info()

### Deal with imbalanced classes (target column: Stay)
# Print the number of rows per Stay class.
print(df['Stay'].value_counts())
from sklearn.model_selection import train_test_split

# Feature matrix: every column except Stay.  Target vector: Stay.
x = df.drop(columns='Stay')
y = df['Stay']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y -- confirm that is intended
# given the class imbalance this section is about.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=22
)
from imblearn.over_sampling import SMOTE
    elif x in [6, 7, 8]:
        return 'Summer'
    elif x in [9, 10, 11]:
        return 'Autumn'

# Derive a Season label for every row from its Month value via map_months.
df['Season'] = df['Month'].apply(map_months)
print(df.head(n=10))

### Visualization
# Pairwise plots of the weather readings, the count column and the season label.
# NOTE(review): Season holds string labels such as 'Summer' / 'Autumn'; confirm
# the installed seaborn version accepts a non-numeric column in `vars`.
sns.pairplot(
    data=df,
    vars=['temperature', 'humidity', 'windspeed', 'count', 'Season'],
)
plt.show()
################# Detect and handle outliers
# Columns that are screened for outliers.
columns = ['temperature', 'humidity', 'windspeed']

from datasist.structdata import detect_outliers

# Row labels flagged by datasist for the listed columns.
outliers = detect_outliers(df, 0, columns)
# The original author noted 10 flagged rows on their data.
print(len(outliers))
# Delete the flagged rows from df in place.
df.drop(index=outliers, inplace=True)

## Deal with categorical data (Season)
# One-hot encode Season; drop_first=True omits the first category's column.
df = pd.get_dummies(data=df, columns=['Season'], drop_first=True)
print(df.head(n=10))
#############  Split the data into train and test sets
# The feature matrix below is a hand-picked subset of columns: the target
# (count) is removed together with date, Month, Day and Season_Summer.
from sklearn.model_selection import train_test_split

print(df['Year'])

x = df.drop(columns=['count', 'date', 'Month', 'Day', 'Season_Summer'])
y = df['count']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=90
)