class data_getter:
    """Load the raw training data from a file source into a pandas DataFrame.

    Progress is reported through ``App_Logger.log`` using the log handle
    supplied by the caller; this class neither opens nor closes that handle.
    """

    def __init__(self, file_object):
        """Keep the caller-supplied log handle and create a logger.

        Args:
            file_object: Log handle passed as the first argument to
                ``App_Logger.log``.
        """
        self.logger = App_Logger()
        self.log_file = file_object

    def data_load(self, file):
        """Read ``file`` with ``pandas.read_csv`` and return the DataFrame.

        Cells holding the literal ``?`` are parsed as missing values.

        Args:
            file: Path or file-like object accepted by ``pandas.read_csv``.

        Returns:
            The parsed ``pandas.DataFrame``.

        Raises:
            Exception: Whatever the read (or the logging around it) raised;
                the failure is logged and the exception is re-raised as-is.
        """
        self.logger.log(self.log_file, "Entering into DATA GETTER METHOD")
        try:
            self.logger.log(
                self.log_file,
                "Now we are starting data gathering from the file source")
            data = pd.read_csv(file, na_values='?')
            self.logger.log(
                self.log_file,
                "Now we have gathered the data frome the source and converted it into a pandas dataframe"
            )
            return data
        except Exception:
            self.logger.log(self.log_file,
                            "oops!!Data gathering not succesful")
            # Bare raise keeps the original exception and traceback intact.
            raise
class preprocess:
    """File-based preprocessing for the automobile price data set.

    ``gather`` reads the raw file and names its columns, ``set_types`` casts
    the numeric text columns to float, and ``imputation`` fills missing
    values and writes the result to ``OUTPUT_PATH``.
    """

    # NOTE(review): machine-specific absolute paths, kept unchanged so the
    # existing deployment behaves the same. Exposed as class attributes so
    # they can be overridden without editing the methods.
    LOG_PATH = r'C:\Users\poorvi\Desktop\auto_project\Training_logs\training_preprocessing_logs.txt'
    OUTPUT_PATH = r"C:\Users\poorvi\Desktop\auto_project\Training_preprocessing\preprocessed_file.csv"
    # NOTE(review): set_types has always written its messages to this
    # relative log file rather than LOG_PATH; confirm which one is intended.
    SET_TYPES_LOG_PATH = "./Training_logs/preprocessing_logs.txt"

    # Column names assigned to the header-less raw file, in file order.
    COLUMN_NAMES = (
        "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
        "num-of-doors", "body-style", 'drive-wheels', 'engine-location',
        'wheel-base', 'length', 'width', 'height', 'curb-weight',
        'engine-type', 'num-of-cylinder', 'engine-size', 'fuel-system',
        'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
        'city-mpg', 'highway-mpg', 'price',
    )

    # Columns that set_types casts to float.
    FLOAT_COLUMNS = (
        "normalized-losses", "bore", "stroke", "horsepower", "peak-rpm",
        "price",
    )

    def __init__(self, file):
        """Remember the raw data source and create a logger.

        Args:
            file: Path or buffer later handed to ``pandas.read_csv``.
        """
        self.logger = App_Logger()
        self.file = file

    def gather(self):
        """Read the raw header-less file and assign the column names.

        ``?`` cells are parsed as missing values.

        Returns:
            ``pandas.DataFrame`` whose columns are ``COLUMN_NAMES``.

        Raises:
            Exception: Any read or column-assignment error, logged and
                re-raised unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(log_file, "DATA is being gathered ")
                auto_data = pd.read_csv(self.file, header=None, na_values="?")
                self.logger.log(log_file, "DATA gathering completed ")
                auto_data.columns = list(self.COLUMN_NAMES)
                self.logger.log(log_file, "columns for data set has been set")
                return auto_data
            except Exception:
                self.logger.log(log_file, "Files gathering is not succesful")
                raise

    def set_types(self, auto_data):
        """Cast every column in ``FLOAT_COLUMNS`` to float, in place.

        Args:
            auto_data: DataFrame containing all ``FLOAT_COLUMNS``.

        Returns:
            The same DataFrame object, after the casts.

        Raises:
            Exception: Any cast error, logged and re-raised unchanged.
        """
        # Only one log handle is opened; opening a second one on top of it
        # would leave the first handle unclosed.
        with open(self.SET_TYPES_LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will set the types of data into required")
                for column in self.FLOAT_COLUMNS:
                    auto_data[column] = auto_data[column].astype("float")
                self.logger.log(log_file,
                                "DATA Types has been set for each feature")
                return auto_data
            except Exception:
                self.logger.log(log_file,
                                "setting data types was not completed")
                raise

    def imputation(self, auto_data):
        """Fill missing values and write the result to ``OUTPUT_PATH``.

        Numeric feature columns are filled with their mean, rows without a
        price are dropped, and the remaining non-numeric columns are filled
        with their most frequent value. ``auto_data`` is modified in place.

        Args:
            auto_data: DataFrame containing a ``price`` column.

        Returns:
            None. The result is written as CSV to ``OUTPUT_PATH``.

        Raises:
            Exception: Any imputation or write error, logged and re-raised
                unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will remove the missing values from the data")
                # Index.drop returns a new Index, so the result has to be
                # kept. Otherwise price would be mean-imputed and the dropna
                # below could never remove a row.
                num_col = auto_data.select_dtypes(
                    include=[np.number]).columns.drop("price")
                mean_imputer = SimpleImputer(missing_values=np.nan,
                                             strategy='mean')
                auto_data[num_col] = mean_imputer.fit_transform(
                    auto_data[num_col])
                self.logger.log(
                    log_file,
                    "missing values imputation for numerical data is done ,,,,, Now we will handle the target variable"
                )
                auto_data.dropna(subset=["price"], axis=0, inplace=True)
                auto_data.reset_index(drop=True, inplace=True)

                cat_col = auto_data.select_dtypes(
                    exclude=[np.number]).columns
                mode_imputer = SimpleImputer(missing_values=np.nan,
                                             strategy='most_frequent')
                auto_data[cat_col] = mode_imputer.fit_transform(
                    auto_data[cat_col])
                self.logger.log(log_file,
                                "Imputations of missing values is done----")
                # NOTE(review): the row index is written as an extra unnamed
                # column; confirm that readers of this file expect it.
                auto_data.to_csv(self.OUTPUT_PATH)
            except Exception:
                self.logger.log(log_file,
                                "Imputation of missing values failed")
                raise
def __init__(self, file):
    """Store the data source and attach a fresh ``App_Logger``.

    Args:
        file: Data source reference kept on the instance as ``self.file``.
    """
    self.file = file
    self.logger = App_Logger()
def __init__(self, data, file_object):
    """Store the training data and log handle, and attach an ``App_Logger``.

    Args:
        data: Data kept on the instance as ``self.data``.
        file_object: Log handle kept on the instance as ``self.log_file``.
    """
    self.data, self.log_file = data, file_object
    self.logger = App_Logger()
class model_fit:
    """Fit and tune a random-forest regressor on the preprocessed data."""

    def __init__(self, data, file_object):
        """Store the training frame and log handle and create a logger.

        Args:
            data: DataFrame containing the features and a ``price`` column.
            file_object: Log handle passed to ``App_Logger.log``. It is
                closed by ``training`` when that method finishes.
        """
        self.log_file = file_object
        self.data = data
        self.logger = App_Logger()

    def training(self):
        """Tune a ``RandomForestRegressor`` and save the search to disk.

        The data is split 75/25 with a fixed seed, a default forest is
        fitted, and a 10-candidate, 5-fold ``RandomizedSearchCV`` is then
        run on the training part. The fitted search object is dumped to
        ``model.pkl`` in the current working directory.

        Returns:
            None. The result is the ``model.pkl`` file.

        Raises:
            Exception: Any error from splitting, fitting or saving, logged
                and re-raised unchanged. ``self.log_file`` is closed on both
                the success and the failure path.
        """
        try:
            self.logger.log(self.log_file, "Entering into training method ")
            self.logger.log(
                self.log_file,
                "Now we willl firstly split the data into training and testing set"
            )
            X = self.data.drop('price', axis=1)
            Y = self.data['price']
            # Only the training part is used here; the held-out part is
            # intentionally discarded.
            x_train, _, y_train, _ = train_test_split(X,
                                                      Y,
                                                      test_size=0.25,
                                                      random_state=3)
            self.logger.log(self.log_file,
                            "Dataset splitting succesfully done")

            self.logger.log(
                self.log_file,
                "Now we will fit the randomforestregressor on the training and test set"
            )
            rf = RandomForestRegressor()
            # NOTE(review): RandomizedSearchCV clones its estimator, so this
            # baseline fit does not influence the saved model.
            rf.fit(x_train, y_train)
            self.logger.log(
                self.log_file,
                "Randomforestregressr fitted succesfully on the training set")

            self.logger.log(
                self.log_file,
                "Now we will perfrom hyperparameter tuning on the randomforestregressor for better results"
            )
            self.logger.log(self.log_file,
                            "Now we are setting best paramterers range")
            random_grid = {
                # Number of trees: 100, 200, ..., 1200.
                'n_estimators': [
                    int(x) for x in np.linspace(start=100, stop=1200, num=12)
                ],
                # 1.0 means "all features", which is what the former 'auto'
                # alias meant for regressors; current scikit-learn rejects
                # the string alias.
                'max_features': [1.0, 'sqrt'],
                # Maximum tree depth: 5, 10, ..., 30.
                'max_depth': [int(x) for x in np.linspace(5, 30, num=6)],
                'min_samples_split': [2, 5, 10, 15, 100],
                'min_samples_leaf': [1, 2, 5, 10],
            }
            self.logger.log(self.log_file,
                            "Best parameters ranged succesfullly")

            rf_random = RandomizedSearchCV(estimator=rf,
                                           param_distributions=random_grid,
                                           scoring='neg_mean_squared_error',
                                           n_iter=10,
                                           cv=5,
                                           random_state=42,
                                           n_jobs=1)
            self.logger.log(
                self.log_file,
                "Randomized search cv done on randomforestregressor")
            rf_random.fit(x_train, y_train)
            self.logger.log(
                self.log_file,
                "fitting model with best parameters on the training set")
            # The whole search object is saved, not just best_estimator_.
            joblib.dump(rf_random, 'model.pkl')
            self.logger.log(self.log_file, "saving the best model")
        except Exception:
            self.logger.log(
                self.log_file,
                "looks like there is some error in model training !!!try with removing errors"
            )
            raise
        finally:
            self.log_file.close()
class tuning:
    """Hyper-parameter searches for the XGBoost and random-forest models."""

    LOG_PATH = r"./Training_logs/training_model_tuning_logs.txt"
    # NOTE(review): machine-specific absolute path, kept unchanged.
    MODEL_PATH = r"C:\Users\poorvi\Desktop\auto_project\model.pkl"

    def __init__(self):
        """Create the logger used by both tuning methods."""
        self.logger = App_Logger()

    def tuning_xgboost(self, x_train, y_train, x_test, y_test):
        """Grid-search the XGBoost regressor and score the best estimator.

        Args:
            x_train: Training features.
            y_train: Training target.
            x_test: Held-out features used for scoring.
            y_test: Held-out target used for scoring.

        Returns:
            ``best_estimator_.score(x_test, y_test)`` of the fitted search.

        Raises:
            Exception: Any error from the search, logged and re-raised
                unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Nowe will tune the xgboost regressor with GridSearchCV")
                # NOTE(review): `xgboost` must be bound to a regressor class
                # by the file's imports; if it is the xgboost module itself
                # this call fails. Confirm against the import block.
                xgr = xgboost()
                self.logger.log(log_file, "Now setting parameter range")
                params = {
                    'min_child_weight': [4, 5],
                    'gamma': [i / 10.0 for i in range(3, 6)],
                    'subsample': [i / 10.0 for i in range(6, 11)],
                    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
                    'max_depth': [2, 3, 4],
                }
                self.logger.log(log_file,
                                "Estimating the best parameters for xgboost")
                grid = GridSearchCV(xgr, params)
                self.logger.log(log_file,
                                "Best parameters estimation succesful")
                grid.fit(x_train, y_train)
                self.logger.log(log_file,
                                "NOW fitting tuned model on the training set")
                return grid.best_estimator_.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file, "TUNING xgboost not succesful")
                raise

    def tuning_rf(self, x_train, y_train, x_test, y_test):
        """Random-search the random-forest regressor and save the search.

        Args:
            x_train: Training features.
            y_train: Training target.
            x_test: Unused; kept for signature parity with tuning_xgboost.
            y_test: Unused; kept for signature parity with tuning_xgboost.

        Returns:
            None. The fitted ``RandomizedSearchCV`` object is dumped to
            ``MODEL_PATH``.

        Raises:
            Exception: Any error from the search or the dump, logged and
                re-raised unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Nowe will tune the randomforest regressor with RandomizedSearchCV"
                )
                rf = RandomForestRegressor()
                self.logger.log(log_file, "Now setting parameter range")
                random_grid = {
                    'n_estimators': [
                        100, 200, 300, 400, 500, 600, 700, 800, 900, 1000,
                        1100, 1200
                    ],
                    # 1.0 means "all features", which is what the former
                    # 'auto' alias meant for regressors; current
                    # scikit-learn rejects the string alias.
                    'max_features': [1.0, 'sqrt'],
                    'max_depth': [5, 10, 15, 20, 25, 30],
                    'min_samples_split': [2, 5, 10, 15, 100],
                    'min_samples_leaf': [1, 2, 5, 10],
                }
                self.logger.log(
                    log_file,
                    "Estimating the best parameters for randomforest")
                rf_random = RandomizedSearchCV(
                    estimator=rf,
                    param_distributions=random_grid,
                    scoring='neg_mean_squared_error',
                    n_iter=10,
                    cv=5,
                    random_state=42,
                    n_jobs=1)
                self.logger.log(log_file,
                                "Best parameters estimation succesful")
                self.logger.log(log_file,
                                "NOW fitting tuned model on the training set")
                rf_random.fit(x_train, y_train)
                joblib.dump(rf_random, self.MODEL_PATH)
            except Exception:
                # The message names the model that actually failed.
                self.logger.log(log_file,
                                "TUNING randomforest not succesful")
                raise
def __init__(self):
    """Attach a fresh ``App_Logger`` to the instance."""
    app_logger = App_Logger()
    self.logger = app_logger
def __init__(self, file_object):
    """Store the log handle, attach a logger and log the start-up message.

    Args:
        file_object: Log handle kept as ``self.log_file`` and passed to
            ``App_Logger.log``.
    """
    app_logger = App_Logger()
    self.logger = app_logger
    self.log_file = file_object
    app_logger.log(file_object,
                   "Now we are starting the preprocessing of the data")
class preprocess:
    """In-memory preprocessing steps for the automobile price data set.

    Each method takes a DataFrame, logs what it does through the log handle
    supplied by the caller, and returns the processed DataFrame. Failures
    are logged and re-raised unchanged.
    """

    # Column names assigned by set_columns, in file order.
    COLUMN_NAMES = (
        "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
        "num-of-doors", "body-style", 'drive-wheels', 'engine-location',
        'wheel-base', 'length', 'width', 'height', 'curb-weight',
        'engine-type', 'num-of-cylinder', 'engine-size', 'fuel-system',
        'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
        'city-mpg', 'highway-mpg', 'price',
    )

    # Columns kept by remove_columns.
    USEFUL_COLUMNS = (
        "length", "width", 'horsepower', 'curb-weight', "engine-size",
        "city-mpg", "highway-mpg", 'drive-wheels', 'num-of-cylinder',
        'price',
    )

    # Columns cast by set_type.
    FLOAT_COLUMNS = (
        'length', 'width', 'horsepower', 'curb-weight', 'engine-size',
        'city-mpg', 'highway-mpg',
    )
    OBJECT_COLUMNS = ('drive-wheels', 'num-of-cylinder')

    # Column order returned by feature_remove.
    ENGINEERED_COLUMNS = (
        'horsepower', 'curb-weight', 'engine-size', 'drive-wheels',
        'num-of-cylinder', 'miles', 'area', 'price',
    )

    def __init__(self, file_object):
        """Store the log handle, create a logger and log the start-up line.

        Args:
            file_object: Log handle passed to ``App_Logger.log``.
        """
        self.logger = App_Logger()
        self.log_file = file_object
        self.logger.log(self.log_file,
                        "Now we are starting the preprocessing of the data")

    def set_columns(self, data):
        """Assign ``COLUMN_NAMES`` to ``data`` in place.

        Args:
            data: DataFrame with exactly as many columns as COLUMN_NAMES.

        Returns:
            The same DataFrame object with its columns renamed.

        Raises:
            Exception: Any assignment error (such as a length mismatch).
        """
        self.logger.log(
            self.log_file,
            "Now firstly we will set the names for each column i.e column index"
        )
        try:
            data.columns = list(self.COLUMN_NAMES)
            self.logger.log(self.log_file,
                            "COLumn index set for each features succesfully")
            return data
        except Exception:
            self.logger.log(
                self.log_file,
                "oops!! column index for the columns can not be succesfully set"
            )
            raise

    def target(self, data):
        """Drop rows whose ``price`` is missing and renumber the index.

        Args:
            data: DataFrame containing a ``price`` column; modified in place.

        Returns:
            The same DataFrame object without the price-less rows.

        Raises:
            Exception: Any error from the drop or the index reset.
        """
        self.logger.log(
            self.log_file,
            "This method will return the target variable further for model building "
        )
        try:
            self.logger.log(self.log_file,
                            "Firstly preprocesing the target variable")
            data.dropna(subset=["price"], axis=0, inplace=True)
            self.logger.log(
                self.log_file,
                "Now setting indexes back to normal after droping missing values rows from target variable"
            )
            data.reset_index(drop=True, inplace=True)
            self.logger.log(self.log_file,
                            "target variable preprocessing done")
            return data
        except Exception:
            self.logger.log(self.log_file,
                            "Target variable preprocessing not succesful")
            raise

    def remove_columns(self, data):
        """Return a new frame holding only ``USEFUL_COLUMNS``.

        Args:
            data: DataFrame containing every column in USEFUL_COLUMNS.

        Returns:
            An independent DataFrame with just those columns.

        Raises:
            Exception: Any selection error (such as a missing column).
        """
        self.logger.log(
            self.log_file,
            "Now we come to the third step of preprocessing i.e removing unnecessary columns"
        )
        try:
            self.logger.log(
                self.log_file,
                "Here we are reomving some unnnecessary columns from the data which are of no use in the model building "
            )
            # An explicit copy, because later steps assign into the result;
            # a bare selection triggers pandas' SettingWithCopyWarning.
            useful_data = data[list(self.USEFUL_COLUMNS)].copy()
            self.logger.log(
                self.log_file,
                "we have succesfully removed our unnnecessary columns ")
            return useful_data
        except Exception:
            self.logger.log(
                self.log_file,
                "Removal fo unncessary columns was not successful")
            raise

    def set_type(self, data):
        """Cast ``FLOAT_COLUMNS`` to float and ``OBJECT_COLUMNS`` to object.

        Args:
            data: DataFrame containing those columns; modified in place.

        Returns:
            The same DataFrame object after the casts.

        Raises:
            Exception: Any cast error.
        """
        self.logger.log(
            self.log_file,
            "Now we are entering to third preprocessing step i.e setting correct daat type for each feature"
        )
        try:
            self.logger.log(
                self.log_file,
                "Here we are setting required data types for each column and then returning correct dataframe"
            )
            for column in self.FLOAT_COLUMNS:
                data[column] = data[column].astype('float')
            for column in self.OBJECT_COLUMNS:
                data[column] = data[column].astype('object')
            self.logger.log(
                self.log_file,
                "we have succesfully set the correct data type for each column"
            )
            return data
        except Exception:
            self.logger.log(
                self.log_file,
                "looks like there is some error occured in setting data types for each columns"
            )
            raise

    def imputation(self, data):
        """Fill missing values: mean for numeric, mode for other columns.

        The ``price`` column is excluded from the numeric imputation.

        Args:
            data: DataFrame containing a ``price`` column; modified in place.

        Returns:
            The same DataFrame object with the missing values filled.

        Raises:
            Exception: Any imputation error.
        """
        self.logger.log(
            self.log_file,
            "Now we are starting the next step of preprocessing i.e imputation of missing values"
        )
        try:
            self.logger.log(
                self.log_file,
                "NOW WE are starting to impute missing values as per reuirements on the columns"
            )
            num_col = data.select_dtypes(
                include=[np.number]).columns.drop('price')
            cat_col = data.select_dtypes(exclude=[np.number]).columns

            mean_imputer = SimpleImputer(missing_values=np.nan,
                                         strategy='mean')
            self.logger.log(
                self.log_file,
                "imputing the numerical columns Nan VALUES WITH MEAN")
            data[num_col] = mean_imputer.fit_transform(data[num_col])

            mode_imputer = SimpleImputer(missing_values=np.nan,
                                         strategy='most_frequent')
            self.logger.log(
                self.log_file,
                "nOW WE ARE IMPUTING THE CATEGORICAL COLUMNS MISSING VALUES WITH MODE"
            )
            data[cat_col] = mode_imputer.fit_transform(data[cat_col])
            self.logger.log(self.log_file,
                            "IMPUTATION OF MISSING VALUES IS COMPLETED")
            return data
        except Exception:
            self.logger.log(
                self.log_file,
                "LOOKS LIKE THERE IS SOME ERROR IN IMPUTING ISSING VALUES")
            raise

    def feature_remove(self, data):
        """Replace four raw columns with two engineered ones.

        ``area`` is ``length * width`` and ``miles`` is
        ``city-mpg - highway-mpg``; the four source columns are then dropped
        from ``data`` in place.

        Args:
            data: DataFrame containing every column the step reads.

        Returns:
            An independent DataFrame with columns in ENGINEERED_COLUMNS
            order.

        Raises:
            Exception: Any error (such as a missing column).
        """
        self.logger.log(
            self.log_file,
            "now we have entered in the step of feature removal or adding")
        try:
            self.logger.log(
                self.log_file,
                "Now we will add some featurs new and remove some old features"
            )
            data['area'] = data['length'] * data['width']
            data['miles'] = data['city-mpg'] - data['highway-mpg']
            self.logger.log(self.log_file,
                            "adding two new features area and miles")
            data.drop(columns=['length', 'width', 'city-mpg', 'highway-mpg'],
                      inplace=True)
            self.logger.log(self.log_file,
                            "removing four old features on their places")
            # An explicit copy, because scaling() assigns into the result.
            arrange_data = data[list(self.ENGINEERED_COLUMNS)].copy()
            self.logger.log(self.log_file,
                            "feature engineering completed succesfully")
            return arrange_data
        except Exception:
            self.logger.log(self.log_file, "featuree engineering unsuccesful")
            raise

    def scaling(self, data):
        """Min-max scale the numeric features and merge rare cylinder counts.

        Every numeric column except ``price`` is scaled with
        ``MinMaxScaler``. In ``num-of-cylinder`` the values ``"three"`` and
        ``"twelve"`` are replaced by ``"eight"``.

        Args:
            data: DataFrame with ``price`` and ``num-of-cylinder`` columns;
                modified in place.

        Returns:
            The same DataFrame object after scaling.

        Raises:
            Exception: Any scaling or replacement error.
        """
        self.logger.log(
            self.log_file,
            "In this step we are gonna scale all numerical features in the same range"
        )
        try:
            self.logger.log(
                self.log_file,
                "here we have started scaling the features with MinMaxScaler]")
            num_col = data.select_dtypes(
                include=[np.number]).columns.drop('price')
            data[num_col] = MinMaxScaler().fit_transform(data[num_col])
            # Assign the result back: an in-place replace on the selected
            # column does not update the frame under pandas Copy-on-Write.
            data['num-of-cylinder'] = data['num-of-cylinder'].replace({
                "three": "eight",
                "twelve": "eight"
            })
            self.logger.log(
                self.log_file,
                "Now here we have scaled all the numerical features in the same range"
            )
            return data
        except Exception:
            self.logger.log(self.log_file,
                            "oops!! feature scaling not succesfull")
            raise

    def encoding(self, data):
        """One-hot encode the categorical columns with ``pandas.get_dummies``.

        The first level of every encoded column is dropped
        (``drop_first=True``).

        Args:
            data: DataFrame to encode; not modified.

        Returns:
            A new DataFrame with dummy columns in place of the categoricals.

        Raises:
            Exception: Any encoding error.
        """
        self.logger.log(
            self.log_file,
            "Now it is the end step of preprocessing i.e encoding categorical variables"
        )
        try:
            self.logger.log(
                self.log_file,
                "here we are using dummy variables function for encoding categorical features"
            )
            encoded_data = pd.get_dummies(data, drop_first=True)
            self.logger.log(self.log_file,
                            "encoding categorical feature done succesfully")
            return encoded_data
        except Exception:
            self.logger.log(
                self.log_file,
                "oops!! encoding categorical features can not be succesfully done"
            )
            raise
## importing required libraries
from flask import Flask, flash, render_template, request, redirect
import flask_monitoringdashboard as dashboard
from werkzeug.utils import secure_filename
import csv
from predictionfolder.prediction import predict
from logs.logger import App_Logger
import requests
import pandas as pd
import joblib
from retraining import retraining

## for logging
flask_log = App_Logger()
file_object = open('./flask_logs.txt', 'a+')
flask_log.log(file_object, "starting user interface")

## setting allowed files criteria
flask_log.log(file_object, "setting allowed files extensions for file input")
ALLOWED_EXTENSIONS = {'csv', 'xlsx', 'data'}
UPLOAD_FOLDER = './Charts'


## function to check whether the file has one of the allowed extensions
def allowed_file(filename):
    """Return whether ``filename`` ends in one of ``ALLOWED_EXTENSIONS``.

    The check is case-insensitive and looks only at the text after the last
    dot; a name without a dot is rejected.

    Args:
        filename: Name of the uploaded file.

    Returns:
        True when the extension is allowed, otherwise False.
    """
    # A short-lived handle that is closed again after logging, so repeated
    # calls do not accumulate open handles on the log file.
    with open('./flask_logs.txt', 'a+') as log_handle:
        flask_log.log(log_handle, "checking if file is in correct extension")
    return '.' in filename and filename.rsplit(
        '.', 1)[1].lower() in ALLOWED_EXTENSIONS
class model_building:
    """Split the preprocessed data and fit baseline regressors on it."""

    # Each method opens and closes this log file itself. No handle is opened
    # in the class body, because one opened there would never be used or
    # closed.
    LOG_PATH = "./Training_logs/training_model_building_logs.txt"

    def __init__(self, file):
        """Remember the preprocessed data file and create a logger.

        Args:
            file: Path or buffer later handed to ``pandas.read_csv``.
        """
        self.logger = App_Logger()
        self.file = file

    def data_splitting(self):
        """Load the data file and split it 75/25 into train and test parts.

        The split uses ``random_state=3``; ``price`` is the target column.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test)``.

        Raises:
            Exception: Any read or split error, logged and re-raised
                unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                data = pd.read_csv(self.file)
                self.logger.log(log_file, "Data splitting is now started")
                X = data.drop("price", axis=1)
                Y = data['price']
                x_train, x_test, y_train, y_test = train_test_split(
                    X, Y, test_size=0.25, random_state=3)
                self.logger.log(
                    log_file,
                    "Data is now splitted into training and test set")
                return x_train, y_train, x_test, y_test
            except Exception:
                self.logger.log(log_file, "Data splitting is not finished")
                raise

    def randomforest_reg(self, x_train, y_train, x_test, y_test):
        """Fit a default ``RandomForestRegressor`` and score it.

        Args:
            x_train: Training features.
            y_train: Training target.
            x_test: Held-out features.
            y_test: Held-out target.

        Returns:
            ``rf.score(x_test, y_test)`` of the fitted model.

        Raises:
            Exception: Any fit or scoring error, logged and re-raised
                unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will fit RandomForestRegressor in the training set"
                )
                rf = RandomForestRegressor()
                rf.fit(x_train, y_train)
                self.logger.log(
                    log_file,
                    "RandomForestRegressor is now fitted on to the training set"
                )
                return rf.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file,
                                "Model fitting randomforest not succesful")
                raise

    def xgboost_reg(self, x_train, y_train, x_test, y_test):
        """Fit the model reported as "Xgboostregressor" and score it.

        Args:
            x_train: Training features.
            y_train: Training target.
            x_test: Held-out features.
            y_test: Held-out target.

        Returns:
            ``xg.score(x_test, y_test)`` of the fitted model.

        Raises:
            Exception: Any fit or scoring error, logged and re-raised
                unchanged.
        """
        with open(self.LOG_PATH, "a+") as log_file:
            try:
                self.logger.log(
                    log_file,
                    "Now we will fit Xgboostregressor in the training set")
                # NOTE(review): despite the method name and log messages this
                # fits a RandomForestRegressor. Left unchanged because no
                # XGBoost regressor is visibly imported here; confirm and
                # swap in the intended estimator.
                xg = RandomForestRegressor()
                xg.fit(x_train, y_train)
                self.logger.log(
                    log_file,
                    "Xgboostregressor is now fitted on to the training set")
                return xg.score(x_test, y_test)
            except Exception:
                self.logger.log(log_file,
                                "Model fitting xgboost not succesful")
                raise
def __init__(self, file_object):
    """Store the log handle and attach a fresh ``App_Logger``.

    Args:
        file_object: Log handle kept on the instance as ``self.log_file``.
    """
    self.log_file = file_object
    self.logger = App_Logger()