reg = RandomForestRegressor() reg.fit(Xtrain, ytrain) preds = reg.predict(Xval) print( f'The validation RMSLE error for baseline model is: {RMSLE(np.exp(yval), np.exp(preds))}' ) sub_preds = reg.predict(test[cols]) submit['units_sold'] = np.exp(sub_preds) submit.head(2) submit.to_csv('sub_baseline_v1.csv', index=False) from category_encoders import TargetEncoder, MEstimateEncoder encoder = MEstimateEncoder() encoder.fit(train['store_id'], train['units_sold']) train['store_encoded'] = encoder.transform(train['store_id'], train['units_sold']) test['store_encoded'] = encoder.transform(test['store_id'], test['units_sold']) encoder.fit(train['sku_id'], train['units_sold']) train['sku_encoded'] = encoder.transform(train['sku_id'], train['units_sold']) test['sku_encoded'] = encoder.transform(test['sku_id'], test['units_sold']) skus = train.sku_id.unique() print(skus[:2]) test_preds = test.copy() test_preds.tail(2) def sku_model(sku, cols_to_use, reg):
# Reserve 20% of the rows (fixed seed) for fitting the encoder only.
X_encode = df.sample(frac=0.20, random_state=0)
y_encode = X_encode.pop("SalePrice")

# The remaining rows form the training split.
X_pretrain = df.drop(index=X_encode.index)
y_train = X_pretrain.pop("SalePrice")

# M-estimate encoder applied to the "Neighborhood" column with m=1.0.
encoder = MEstimateEncoder(cols=["Neighborhood"], m=1.0)

# Learn the encoding from the encoding split ...
encoder.fit(X_encode, y_encode)

# ... then apply it to the training split.
X_train = encoder.transform(X_pretrain, y_train)

# Draw the target's KDE and, on the same axes, the normalized histogram
# of the encoded feature in red.
feature = encoder.cols
plt.figure(dpi=90)
ax = sns.distplot(y_train, hist=False, kde=True)
ax = sns.distplot(
    X_train[feature], ax=ax, color='r', hist=True, kde=False, norm_hist=True
)
ax.set_xlabel("SalePrice");

# Score the full unencoded data and the encoded training split.
X = df.copy()
y = X.pop("SalePrice")
score_base = score_dataset(X, y)
score_new = score_dataset(X_train, y_train)
rcParams['figure.figsize']=10,6 df1=np.log1p(df['UnitPrice']) df.drop("UnitPrice", axis = 1, inplace = True) !pip install category_encoders from category_encoders import MEstimateEncoder df=df.astype({'StockCode': 'category','Description': 'category'}) category_list=['StockCode','Description'] encoder_final=MEstimateEncoder() encoder_final.fit(df[category_list], df1) cat_enc = encoder_final.transform(df[category_list], df1) continuous_train = df.drop(columns= category_list) df = pd.concat([cat_enc,continuous_train],axis=1) test_enc=encoder_final.transform(df2[category_list]) continuous_test=df2.drop(columns= category_list) df2=pd.concat([test_enc,continuous_test],axis=1) df2.head() sns.distplot(np.log1p(df['Quantity'])) from scipy import stats y=np.log1p(df['UnitPrice'])