Exemplo n.º 1
0
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV

# %%
# Global plot theme (sns is presumably seaborn; its import is outside this excerpt).
sns.set(style="darkgrid", palette="muted")

# %%
# Switch into the data directory so the CSV can be opened by its bare name.
# NOTE: the path is relative to the current working directory, so this cell
# is meant to be executed once per session.
os.chdir("data")
# A literal "?" in the file is parsed as a missing value.
df = pd.read_csv("Automobile_data.csv", na_values="?")
df.info()

# %%
df.head()

# %%
# null_checker is defined outside this excerpt; presumably it reports the
# missing values in the frame -- confirm against its definition.
null_checker(df)

# %%
# Keep only the rows that have a price.
df = df.dropna(subset=["price"])
null_checker(df)

# %%
# Discard the normalized-losses column.
df = df.drop(columns=["normalized-losses"])

# %% [markdown]
# # EDA

# %%
df.describe()
Exemplo n.º 2
0
# %%
df.info()

# %% [markdown] id="g1GS1AAUZIt9"
# # Preprocessing

# %% execution={"iopub.execute_input": "2020-10-08T14:24:34.751690Z", "iopub.status.busy": "2020-10-08T14:24:34.751690Z", "iopub.status.idle": "2020-10-08T14:24:34.765663Z", "shell.execute_reply": "2020-10-08T14:24:34.762660Z", "shell.execute_reply.started": "2020-10-08T14:24:34.751690Z"} id="INV8VvOYZItN"
# Remove outliers: rows whose Kilometers_Driven exceeds one million.
# The "greater than" test is negated (rather than written as <=) so that rows
# with a missing Kilometers_Driven are still kept at this step.
df = df[~df["Kilometers_Driven"].gt(1e6)]
df.shape

# %% id="TYqvFHW1HqFX"
# Discard every row that still contains at least one missing value.
df = df.dropna(how="any")
null_checker(df)

# %% [markdown] id="yEgVyyNSZIt9"
# ## Train test split

# %% execution={"iopub.execute_input": "2020-10-08T14:24:51.747335Z", "iopub.status.busy": "2020-10-08T14:24:51.747335Z", "iopub.status.idle": "2020-10-08T14:24:51.759305Z", "shell.execute_reply": "2020-10-08T14:24:51.757306Z", "shell.execute_reply.started": "2020-10-08T14:24:51.747335Z"} id="nPxFt6bSZIt-" outputId="50d71945-3c1c-4fe9-bb86-b9ae483a319b"
# Split into train and test sets up front to prevent data leakage.
# Price is the target; every other column is a feature.
y = df["Price"]
X = df.drop(columns="Price")
# A quarter of the rows is held out for testing; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

# %% [markdown] id="oxqsMHrKZIuA"
# ## Encoding

# %%
# Define category mapping for label encoding