def trainingModel(self):
    """Train and persist the best model for the bank-marketing dataset.

    Pipeline: load data -> separate features/label ('y') -> drop EDA-flagged
    columns -> encode label and categorical features -> impute missing values
    -> balance classes -> train/test split -> pick and save the best model.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(data, label_column_name='y')

        # Dropping column(s) after performing EDA
        preprocessor_cus = preprocess_cus.Preprocessor_cus(self.file_object, self.log_writer)
        X = preprocessor_cus.drop_column(X)

        # Encoding Y (prediction label): "no" -> 0, "yes" -> 1
        Y.replace({"no": 0, "yes": 1}, inplace=True)

        # Encode categorical features using predefined values
        X = preprocessor_cus.test_data_encode(X)
        print("Shape of the dataset after encoding: ", X.shape)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(X)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            X = preprocessor.impute_missing_values(X)  # missing value imputation

        # balance the target classes before splitting
        features, label = preprocessor.handleImbalanceDataset(X, Y)

        # splitting the data into training and test set
        x_train, x_test, y_train, y_test = train_test_split(
            features, label, test_size=1 / 5, random_state=42)

        model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

        # getting the best model
        best_model_name, best_model = model_finder.get_best_model(
            x_train, y_train, x_test, y_test)

        # saving the best model to the directory.
        file_op = file_methods.File_Operation(self.file_object, self.log_writer)
        save_model = file_op.save_model(best_model, best_model_name)

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: `raise Exception` raised a bare Exception class and threw
        # away the original error; bare `raise` preserves it with traceback.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models for the wafer dataset.

    Pipeline: load data -> drop 'Wafer' id column -> separate features/label
    ('Output') -> impute missing values -> drop zero-std-deviation columns ->
    KMeans clustering -> per cluster: split, tune, and save the best model.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
        # remove the id column as it doesn't contribute to prediction.
        data = preprocessor.remove_columns(data, ['Wafer'])

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(data, label_column_name='Output')

        # check if missing values are present in the dataset
        is_null_present = preprocessor.is_null_present(X)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            X = preprocessor.impute_missing_values(X)  # missing value imputation

        # columns with zero standard deviation are constant and carry no
        # signal for good vs bad sensors -> drop them
        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
        X = preprocessor.remove_columns(X, cols_to_drop)

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # attach the labels so each cluster keeps its targets
        X['Labels'] = Y

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            # splitting the data into training and test set for each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3, random_state=355)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: bare `raise` re-raises the original error instead of the
        # information-free `raise Exception`.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models for the insurance-fraud dataset.

    Pipeline: load data -> drop non-predictive columns -> replace '?' with NaN
    -> impute -> encode categoricals -> separate label ('fraud_reported') ->
    KMeans clustering -> per cluster: split, scale, tune, and save best model.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
        # remove the columns as they don't contribute to prediction (per EDA).
        data = preprocessor.remove_columns(data, [
            'policy_number', 'policy_bind_date', 'policy_state', 'insured_zip',
            'incident_location', 'incident_date', 'incident_state',
            'incident_city', 'insured_hobbies', 'auto_make', 'auto_model',
            'auto_year', 'age', 'total_claim_amount'
        ])

        # replacing '?' with NaN values for imputation
        # (np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0)
        data.replace('?', np.nan, inplace=True)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)  # missing value imputation

        # encode categorical data
        data = preprocessor.encode_categorical_columns(data)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='fraud_reported')

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # attach the labels so each cluster keeps its targets
        X['Labels'] = Y

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            # splitting the data into training and test set for each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3, random_state=355)

            # Proceeding with more data pre-processing steps
            x_train = preprocessor.scale_numerical_columns(x_train)
            x_test = preprocessor.scale_numerical_columns(x_test)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: bare `raise` preserves the original exception instead of
        # raising an empty Exception.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models for the phishing dataset.

    Pipeline: load data -> replace invalid values with NaN -> impute ->
    separate label ('Result') -> KMeans clustering -> per cluster: split,
    tune, and save the best model.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

        # replacing '?' values with NaN as discussed in the EDA part
        data = preprocessor.replaceInvalidValuesWithNull(data)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)  # missing value imputation

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(data, label_column_name='Result')

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # attach the labels so each cluster keeps its targets
        X['Labels'] = Y

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            # splitting the data into training and test set for each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3, random_state=36)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: bare `raise` re-raises the original error instead of the
        # information-free `raise Exception`.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models for the restaurant dataset.

    Preprocessing (per the prior EDA): drop unused columns, remove duplicate
    rows, drop rows with missing values, convert the cost column to a number,
    encode categoricals. Then the data is clustered with KMeans and, for each
    cluster, the best model is tuned on scaled features and saved.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Fetch the raw dataset.
        loader = data_loader.Data_Getter(self.file_object, self.log_writer)
        frame = loader.get_data()

        prep = preprocessing.Preprocessor(self.file_object, self.log_writer)

        # Columns identified as non-predictive during EDA.
        frame = prep.dropUnnecessaryColumns(
            frame, ["serial", "rate", "listed_in(type)", "listed_in(city)"])

        # Duplicate rows add no information.
        frame = prep.removeDuplicates(frame)

        # Missing values: instead of imputing, drop the affected rows here.
        has_nulls, _ = prep.is_null_present(frame)
        if has_nulls:
            frame = frame.dropna(how='any')

        # Cost column arrives as text; convert it to float.
        frame = prep.convertCostToNumber(frame)

        # Numeric encoding for the categorical columns.
        frame = prep.encodeCategoricalValues(frame)

        # Split into features and the regression target.
        X, Y = prep.separate_label_feature(
            frame, label_column_name='approx_cost(for two people)')

        # Cluster the feature space; the elbow plot picks the cluster count.
        clusterer = clustering.KMeansClustering(self.file_object, self.log_writer)
        n_clusters = clusterer.elbow_plot(X)
        X = clusterer.create_clusters(X, n_clusters)

        # Keep the target alongside its cluster assignment.
        X['Labels'] = Y
        cluster_ids = X['Cluster'].unique()

        # Tune and persist one model per cluster.
        for cid in cluster_ids:
            subset = X[X['Cluster'] == cid]

            feats = subset.drop(['Labels', 'Cluster'], axis=1)
            target = subset['Labels']

            x_train, x_test, y_train, y_test = train_test_split(
                feats, target, test_size=1 / 3, random_state=36)

            # Standard-scale features before model selection.
            scaled_train = prep.standardScalingData(x_train)
            scaled_test = prep.standardScalingData(x_test)

            finder = tuner.Model_Finder(self.file_object, self.log_writer)
            best_name, best_model = finder.get_best_model(
                scaled_train, y_train, scaled_test, y_test)

            # Persist under "<model-name><cluster-id>".
            saver = file_methods.File_Operation(self.file_object, self.log_writer)
            saver.save_model(best_model, best_name + str(cid))

        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()
    except Exception as e:
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        raise e
def trainingModel(self):
    """Train and persist per-cluster best models for the mushroom dataset.

    Pipeline: load data -> drop 'veil-type' -> replace invalid values with
    NaN -> impute -> encode categoricals -> separate label ('class') ->
    KMeans clustering -> per cluster: stratified split, tune, save model and
    collect performance metrics. Metrics are written to MongoDB and a
    notification email is sent on success.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)

        # removing unwanted columns as discussed in the EDA part
        print('Dropping Unnecessary columns done')
        data = preprocessor.dropUnnecessaryColumns(data, ['veil-type'])
        print('Operation Done!!')

        # replacing '?' values with NaN as discussed in the EDA part
        print('Replace Invalid Values with NULL')
        data = preprocessor.replaceInvalidValuesWithNull(data)
        print('Operation Done!!')

        # check if missing values are present in the dataset
        print('Getting columns for NULL values')
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)
        print('Operation Done!!')

        # if missing values are there, replace them appropriately.
        if is_null_present:
            print('Imputing Missing values!!')
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)  # missing value imputation
            print('Operation Done')

        # get encoded values for categorical data
        data = preprocessor.encodeCategoricalValues(data)
        data.to_csv('tmp.csv', index=False)

        # create separate features and labels
        X, Y = preprocessor.separate_label_feature(data, label_column_name='class')

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # attach the labels so each cluster keeps its targets
        X['Labels'] = Y

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster
            print(cluster_data.shape)
            if 'Labels' in cluster_data:
                print('Labels Column Found')
            if 'Cluster' in cluster_data:
                print('Cluster Column Found')

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']
            print(cluster_label)
            print('Cluster Label and Features Created')

            # stratified split keeps the class balance within each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3,
                stratify=cluster_label, random_state=365)
            print('Train Test Split Done!')

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization
            print('Finding best model for cluster: ', i)

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test, i)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

            # accumulate per-cluster performance metrics
            self.performance_list.extend(model_finder.perf_data)

        # logging the successful Training
        print(self.performance_list)
        print(type(self.performance_list))
        print('Inserting Performance Metrics to MongoDB')
        for dict_l in self.performance_list:
            self.dbObj.insertOneRecord('mushroomClassifierDB',
                                       'performance_metrics', dict_l)
        self.log_writer.log(self.file_object, 'Successful End of Training')
        print('Successfully end training')

        # Triggering notification email
        msg = MIMEMultipart()
        msg['Subject'] = 'MushroomTypeClassifier - Model Train | ' + str(datetime.now())
        body = ('Model Training Done Successfully. Please find the models in '
                'models/ directory... <br><br> Thanks and Regards, <br> Rahul Garg')
        msg.attach(MIMEText(body, 'html'))
        to_addr = ['*****@*****.**']
        self.emailObj.trigger_mail(to_addr, [], msg)

    except Exception as e:
        # BUG FIX: 'Unsuccessful End of Training: ' + e raised a TypeError
        # (str + Exception); convert explicitly before concatenating.
        self.log_writer.log(self.file_object,
                            'Unsuccessful End of Training: ' + str(e))
        # BUG FIX: bare `raise` preserves the original exception instead of
        # raising an empty Exception.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models for the abalone dataset.

    Pipeline: load data -> replace '?' with NaN -> impute -> group the
    imbalanced 'Rings' target -> split off the label frame -> KMeans
    clustering -> re-attach labels, encode categoricals -> per cluster:
    split, tune, save model and record its ROC-AUC score.

    Returns:
        pandas.DataFrame: one row per cluster with columns
        ['Cluster_No', 'Best_Model_Name', 'Roc_Auc_score'].

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
        # no columns to drop for this dataset; kept for pipeline symmetry
        data = preprocessor.remove_columns(data, [])

        # replacing '?' with NaN values for imputation
        # (np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0)
        data.replace('?', np.nan, inplace=True)

        # check if missing values are present in the dataset
        is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

        # if missing values are there, replace them appropriately.
        if is_null_present:
            data = preprocessor.impute_missing_values(
                data, cols_with_missing_values)  # missing value imputation

        # the target has many highly-imbalanced labels, so group them
        data = preprocessor.grouping_values_of_target(data)

        # two frames: X is clustered, Y ('Rings', 'Sex') is re-attached after
        X, Y = preprocessor.separate_data_frame(
            data, label_column_name=['Rings', 'Sex'])

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # re-attach the label frame to the clustered features
        X = pd.concat([X, Y], axis=1, sort=False)

        # encode categorical data
        X = preprocessor.encode_categorical_columns(X)

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        df = pd.DataFrame(
            columns=['Cluster_No', 'Best_Model_Name', 'Roc_Auc_score'])
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Rings', 'Cluster'], axis=1)
            cluster_label = cluster_data['Rings']

            # splitting the data into training and test set for each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3, random_state=100)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model, Roc_Auc_score = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

            # BUG FIX: DataFrame.append was removed in pandas 2.0; build the
            # row as a one-row frame and concat instead.
            row = pd.DataFrame([{
                'Cluster_No': i,
                'Best_Model_Name': best_model_name + str(i),
                'Roc_Auc_score': Roc_Auc_score
            }])
            df = pd.concat([df, row], ignore_index=True)

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()
        return df

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: bare `raise` re-raises the original error instead of the
        # information-free `raise Exception`.
        raise
def trainingModel(self):
    """Train and persist per-cluster best models (label column 'class').

    Pipeline: load data -> encode categoricals -> split features/label ->
    balance classes -> KMeans clustering -> per cluster: split, scale,
    tune, and save the best model.

    Raises:
        Exception: re-raises any failure after logging it.
    """
    # Logging the start of Training
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        # Getting the data from the source
        data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        # doing the data preprocessing
        preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
        data = preprocessor.enocdeCategoricalvalues(data)

        # create separate features and labels
        X = data.drop(['class'], axis=1)
        Y = data['class']

        # balance the target classes before clustering
        X, Y = preprocessor.handleImbalanceDataset(X, Y)

        # Applying the clustering approach
        kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)  # object initialization.
        # using the elbow plot to find the number of optimum clusters
        number_of_clusters = kmeans.elbow_plot(X)

        # Divide the data into clusters; adds a 'Cluster' column
        X = kmeans.create_clusters(X, number_of_clusters)

        # attach the labels so each cluster keeps its targets
        X['Labels'] = Y

        # getting the unique clusters from our dataset
        list_of_clusters = X['Cluster'].unique()

        # parsing all the clusters and looking for the best ML algorithm
        # to fit on each individual cluster
        for i in list_of_clusters:
            cluster_data = X[X['Cluster'] == i]  # filter the data for one cluster

            # Prepare the feature and Label columns
            cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
            cluster_label = cluster_data['Labels']

            # splitting the data into training and test set for each cluster
            x_train, x_test, y_train, y_test = train_test_split(
                cluster_features, cluster_label, test_size=1 / 3, random_state=355)

            # scale features before model selection
            x_train = preprocessor.scaleData(x_train)
            x_test = preprocessor.scaleData(x_test)

            model_finder = tuner.Model_Finder(self.file_object, self.log_writer)  # object initialization

            # getting the best model for each of the clusters
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object, self.log_writer)
            save_model = file_op.save_model(best_model, best_model_name + str(i))

        # logging the successful Training
        self.log_writer.log(self.file_object, 'Successful End of Training')
        self.file_object.close()

    except Exception:
        # logging the unsuccessful Training
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        self.file_object.close()
        # BUG FIX: bare `raise` re-raises the original error instead of the
        # information-free `raise Exception`.
        raise