def test_get_data_jb():
    """Round-trip a list through ``save_data`` / ``get_data`` using method 'jb'.

    A small list is saved under the sample project's processed-data path and
    then read back from the corresponding ``.jbl`` file. Both the value and
    the exact type of the loaded object must match what was saved.
    """
    expected = [1, 2, 3, 4]
    save_data(expected, name='tests/sampletest/data/processed/proce', method='jb')
    # NOTE(review): presumably save_data appends the '.jbl' suffix for
    # method='jb'; the read path below relies on that -- confirm.
    output = get_data(path='tests/sampletest/data/processed/proce.jbl')
    assert expected == output
    # Exact type identity (not isinstance): the loader must return a plain
    # list, not a subclass or another sequence type.
    assert type(expected) is type(output)
#       format_version: '1.5'
#       jupytext_version: 1.5.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

import pandas as pd
import numpy as np
import seaborn as sns
import datasist.project as dp
import datasist as ds

# Load the training CSV from the raw data directory through datasist.
data = dp.get_data('train.csv', loc='raw', method='csv')
ds.structdata.describe(data)

# +
# Report missing values in the loaded data.
ds.structdata.display_missing(data)

# Separate the label from the data: pop() hands back the 'Rating' column and
# removes it from `data` in place (assumes `data` is a pandas DataFrame).
label = data.pop('Rating')

# Encode all categorical features with label encoding.
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
#       jupytext_version: 1.5.0
#   kernelspec:
#     display_name: Python 3
#     language: python
#     name: python3
# ---

# +
import datasist.project as dp
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Retrieve the processed features and labels from the processed folder.
data = dp.get_data("train_proc.csv", method='csv')
label = dp.get_data("train_labels.csv", method='csv')

# Base model with random forest, scored with 5-fold cross-validation.
rf = RandomForestRegressor(n_estimators=10, random_state=2)
score = cross_val_score(estimator=rf, X=data, y=label.Rating, cv=5,
                        scoring="neg_mean_squared_error", n_jobs=-1)

# The scorer yields the *negated* MSE of each fold. Flip the sign and take the
# square root per fold so the reported number really is an RMSE (previously
# the plain mean MSE was printed under the "RMSE" label).
score = np.mean(np.sqrt(-1 * score))
print("RMSE is {}".format(score))

# cross_val_score fits clones of the estimator, so `rf` itself is still
# unfitted at this point. Fit it on the full training data so that the saved
# model can actually be used for prediction.
rf.fit(data, label.Rating)

# Save the fitted model.
dp.save_model(rf, name='rf_model_n10')