def trainingModel(self):
    """Train a model on the loaded data set and save the best candidate.

    Steps, in order:
      1. Load the data through ``Data_Getter``.
      2. Split it into features and the ``'Calories'`` label column.
      3. Impute missing values when the preprocessor reports any.
      4. Drop the columns returned by
         ``get_columns_with_zero_std_deviation``.
      5. Hold out one third of the rows (``random_state=355``) and let
         ``Model_Finder`` choose the best model.
      6. Save that model under the name ``Model_Finder`` returned.

    ``self.file_object`` is closed exactly once when the method finishes,
    whether training succeeded or failed.

    Raises:
        Exception: the error that interrupted training is logged as
            'Unsuccessful End of Training' and then re-raised unchanged,
            so callers keep the original type, message and traceback.
    """
    self.log_writer.log(self.file_object, 'Start of Training')
    try:
        data_getter = Data_Getter(self.file_object, self.log_writer)
        data = data_getter.get_data()

        preprocessor = Preprocessor(self.file_object, self.log_writer)
        X, Y = preprocessor.separate_label_feature(
            data, label_column_name='Calories')

        if preprocessor.is_null_present(X):
            X = preprocessor.impute_missing_values(X)

        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
        X = preprocessor.remove_columns(X, cols_to_drop)

        x_train, x_test, y_train, y_test = train_test_split(
            X, Y, test_size=1 / 3, random_state=355)

        model_finder = Model_Finder(self.file_object, self.log_writer)
        best_model_name, best_model = model_finder.get_best_model(
            x_train, y_train, x_test, y_test)

        file_op = File_operation(self.file_object, self.log_writer)
        file_op.save_model(best_model, best_model_name)

        self.log_writer.log(self.file_object, 'Successful End of Training')
    except Exception:
        self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
        # Bare ``raise`` keeps the real cause; ``raise Exception`` would
        # replace it with an empty, uninformative exception.
        raise
    finally:
        # Single cleanup point for both the success and the failure path.
        self.file_object.close()
def predictFromModel(self):
    """Run the saved model on the prediction data and export the results.

    The previous prediction file is deleted first. The data is then loaded,
    imputed when the preprocessor reports missing values, and stripped of
    the columns returned by ``get_columns_with_zero_std_deviation``. The
    model stored as ``'my_model'`` predicts on the feature values left
    after separating the ``'Calories'`` column, and the predictions are
    written to ``Prediction_Output_File/Predictions.csv``.

    Returns:
        tuple: the output CSV path and a JSON string (``orient="records"``)
        of the first rows of the predictions, as given by ``head()``.

    Raises:
        Exception: any failure inside the pipeline is logged and re-raised.
    """
    try:
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, 'Start of Prediction')

        frame = Data_Getter_Prediction(
            self.file_object, self.log_writer).get_data()

        prep = Preprocessor(self.file_object, self.log_writer)
        if prep.is_null_present(frame):
            frame = prep.impute_missing_values(frame)

        constant_columns = prep.get_columns_with_zero_std_deviation(frame)
        frame = prep.remove_columns(frame, constant_columns)

        loader = File_operation(self.file_object, self.log_writer)
        model = loader.load_model('my_model')

        # Only the feature part is used; the label part is discarded.
        features, _ = prep.separate_label_feature(frame, 'Calories')
        result = pd.Series(list(model.predict(features.values)),
                           name='Predictions')

        path = "Prediction_Output_File/Predictions.csv"
        result.to_csv(path, header=True, mode='a+')
        self.log_writer.log(self.file_object, 'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object,
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise ex
    return path, result.head().to_json(orient="records")
def predictionFromModel(self):
    """Predict per cluster and publish the combined results as a blob.

    Steps, in order:
      1. Delete the previous prediction file and load the prediction data.
      2. Remove the ``"Unnamed: 0"`` column, impute missing values when the
         preprocessor reports any, and drop the columns returned by
         ``get_columns_with_zero_std_deviation``.
      3. Assign every row to a cluster with the saved ``'KMeans'`` model
         (the ``'Wafer'`` identifier column is excluded from its input).
      4. For each cluster, load the model that ``find_correct_model_file``
         names and predict on that cluster's rows only.
      5. Concatenate all per-cluster results, upload them once as
         ``predictions.csv`` to the ``predictionoutputfile`` container and
         read the file back.

    Fixes relative to the previous implementation:
      * The feature frame is derived from the cluster-filtered rows. The
        earlier code dropped ``'Wafer'`` from the full frame, so every
        cluster model predicted on all rows and ``zip`` paired the wrong
        predictions with the cluster's wafer names.
      * Per-cluster results are accumulated and exported together instead
        of each cluster's result replacing the previous one.
        NOTE(review): assumes ``uploadBlob`` overwrites an existing blob;
        confirm against the ``AzureFunc`` implementation.

    Returns:
        str: JSON (``orient="records"``) of the first rows (``head()``) of
        the frame read back from the uploaded CSV.

    Raises:
        Exception: any failure inside the pipeline is logged under
            ``"Prediction_Log"`` and re-raised unchanged.
    """
    try:
        self.pred_data_val.deletePredictionFile()
        self.log_writer.log(self.file_object, "Prediction_Log",
                            'Start of Prediction')
        data_getter = data_loader_prediction.Data_Getter_Pred(
            self.file_object, self.log_writer)
        data = data_getter.get_data()

        # --- data preprocessing ---
        preprocessor = Preprocessor(self.file_object, self.log_writer)
        # The unnamed column does not contribute to prediction.
        data = preprocessor.remove_columns(data, ["Unnamed: 0"])

        if preprocessor.is_null_present(data):
            data = preprocessor.impute_missing_values(data)

        # Constant columns carry no information for the models.
        cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(data)
        data = preprocessor.remove_columns(data, cols_to_drop)

        # --- clustering ---
        file_loader = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
        kmeans = file_loader.load_model('KMeans')
        # 'Wafer' is an identifier, not a feature.
        data['clusters'] = kmeans.predict(data.drop(['Wafer'], axis=1))

        # --- per-cluster prediction ---
        cluster_results = []
        for cluster_id in data['clusters'].unique():
            cluster_data = data[data['clusters'] == cluster_id]
            wafer_names = list(cluster_data['Wafer'])
            # Keep the cluster filter: drop helper columns from the
            # cluster's rows, not from the whole frame.
            cluster_features = cluster_data.drop(
                labels=['Wafer', 'clusters'], axis=1)

            model_name = file_loader.find_correct_model_file(cluster_id)
            model = file_loader.load_model(model_name)
            predictions = list(model.predict(cluster_features))
            cluster_results.append(
                pandas.DataFrame(list(zip(wafer_names, predictions)),
                                 columns=['Wafer', 'Prediction']))

        if cluster_results:
            result = pandas.concat(cluster_results, ignore_index=True)
        else:
            # No clusters means no rows; still publish a well-formed file.
            result = pandas.DataFrame(columns=['Wafer', 'Prediction'])

        pred_result = result.to_csv(header=True)
        self.AzureFunc.uploadBlob("predictionoutputfile", "predictions.csv",
                                  pred_result)
        output = self.AzureFunc.readingcsvfile("predictionoutputfile",
                                               "predictions.csv")
        self.log_writer.log(self.file_object, "Prediction_Log",
                            'End of Prediction')
    except Exception as ex:
        self.log_writer.log(
            self.file_object, "Prediction_Log",
            'Error occured while running the prediction!! Error:: %s' % ex)
        raise
    return output.head().to_json(orient="records")
def trainingModel(self):
    """Train and save one best model per data cluster.

    The data is loaded, stripped of the ``'Wafer'`` and ``"Unnamed: 0"``
    columns and split into features and the ``'Good/Bad'`` label. Missing
    values are imputed when the preprocessor reports any, and the columns
    returned by ``get_columns_with_zero_std_deviation`` are removed. The
    features are then clustered with the number of clusters returned by
    ``elbow_plot``. For each cluster, one third of the rows is held out
    (``random_state=355``), ``Model_Finder`` picks the best model, and it
    is saved under its name with the cluster id appended.

    Progress is logged under ``"ModelTrainingLog"``. Errors are not caught
    here; they propagate to the caller.
    """
    self.log_writer.log(self.file_object, "ModelTrainingLog",
                        'Start of Training')

    # Load the raw data from the source.
    raw_data = data_loader.Data_Getter(self.file_object,
                                       self.log_writer).get_data()

    # --- data preprocessing ---
    prep = Preprocessor(self.file_object, self.log_writer)
    # These two columns do not contribute to prediction.
    trimmed = prep.remove_columns(raw_data, ['Wafer', "Unnamed: 0"])

    X, Y = prep.separate_label_feature(trimmed, label_column_name='Good/Bad')

    if prep.is_null_present(X):
        X = prep.impute_missing_values(X)

    # Constant columns carry no information for the models.
    constant_columns = prep.get_columns_with_zero_std_deviation(X)
    X = prep.remove_columns(X, constant_columns)

    # --- clustering ---
    clusterer = clustering.KMeansClustering(self.file_object, self.log_writer)
    optimal_k = clusterer.elbow_plot(X)
    X = clusterer.create_clusters(X, optimal_k)

    # Attach the labels so they travel with the rows when filtering by
    # cluster.
    X['Labels'] = Y

    # --- one model per cluster ---
    for cluster_id in X['Cluster'].unique():
        members = X[X['Cluster'] == cluster_id]
        features = members.drop(['Labels', 'Cluster'], axis=1)
        labels = members['Labels']

        x_train, x_test, y_train, y_test = train_test_split(
            features, labels, test_size=1 / 3, random_state=355)

        finder = tuner.Model_Finder(self.file_object, self.log_writer)
        best_model_name, best_model = finder.get_best_model(
            x_train, y_train, x_test, y_test)

        saver = file_methods.File_Operation(self.file_object,
                                            self.log_writer)
        saver.save_model(best_model, best_model_name + str(cluster_id))

    self.log_writer.log(self.file_object, "ModelTrainingLog",
                        'Successful End of Training')