class dataTransform:
    """Transform the validated training files before they are loaded.

    Works on every file found in ``self.goodDataPath``.
    """

    def __init__(self):
        # Directory holding the raw training files that passed validation.
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_logger()

    def replaceMissingWithNull(self):
        """Rewrite each good-data CSV with missing values replaced by 'NULL'.

        Every file in ``self.goodDataPath`` is read with pandas, its missing
        cells are filled with the string ``'NULL'`` and the file is written
        back in place (no index column, header kept). One log line is
        written per transformed file.

        Any exception raised while processing is logged and swallowed, so
        the method never raises for a bad file; files after the failing one
        are left untouched.
        """
        # The context manager guarantees the log is closed exactly once,
        # including when the logger itself fails.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file
                    frame = pandas.read_csv(file_path)
                    frame = frame.fillna('NULL')
                    frame.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file)
            except Exception as e:
                # Deliberate best-effort: record the failure, do not raise.
                self.logger.log(
                    log_file,
                    "Data Transformation failed because:: %s" % e)
class Prediction:
    """Run the stored model over the prediction input and save the output."""

    def __init__(self, path):
        """Open the prediction log and, if *path* is given, a validator.

        ``self.pred_data_val`` is only created when *path* is not ``None``.
        """
        self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        self.log_writer = App_logger()
        if path is not None:
            self.pred_data_val = Prediction_data_validation(path)

    def predictFromModel(self):
        """Predict on the fetched data and append the result to a CSV.

        Returns:
            A tuple ``(path, preview)`` where *path* is the output CSV path
            and *preview* is the JSON (``orient="records"``) rendering of
            the first rows of the prediction series.

        Raises:
            Exception: any error from the pipeline is logged and re-raised.
        """
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')

            # Fetch the input data.
            frame = Data_Getter_Prediction(
                self.file_object, self.log_writer).get_data()

            # Pre-process: impute if needed, then drop zero-std columns.
            prep = Preprocessor(self.file_object, self.log_writer)
            if prep.is_null_present(frame):
                frame = prep.impute_missing_values(frame)
            frame = prep.remove_columns(
                frame, prep.get_columns_with_zero_std_deviation(frame))

            # Load the persisted model and predict on the feature values.
            model = File_operation(
                self.file_object, self.log_writer).load_model('my_model')
            features, _label = prep.separate_label_feature(frame, 'Calories')
            result = pd.Series(
                list(model.predict(features.values)), name='Predictions')

            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True, mode='a+')
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occured while running the prediction!! Error:: %s'
                % ex)
            raise ex
        return path, result.head().to_json(orient="records")
class trainModel:
    """Train a model on the prepared training data and persist the best one."""

    def __init__(self):
        self.log_writer = App_logger()
        self.file_object = open("Training_Logs/ModelTrainingLog.txt", 'a+')

    def trainingModel(self):
        """Run the training pipeline end to end.

        Steps: fetch the data, split off the ``'Calories'`` label, impute
        missing values if any are reported, drop the columns reported as
        having zero standard deviation, split into train/test sets
        (one third held out, ``random_state=355``), select the best model
        and save it.

        The training log file is closed when the method finishes, whether
        it succeeds or fails.

        Raises:
            Exception: the original error from any pipeline step is
                re-raised unchanged after being noted in the log.
        """
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = Preprocessor(self.file_object, self.log_writer)
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Calories')

            if preprocessor.is_null_present(X):
                X = preprocessor.impute_missing_values(X)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
            X = preprocessor.remove_columns(X, cols_to_drop)

            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=1 / 3, random_state=355)

            model_finder = Model_Finder(self.file_object, self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            file_op = File_operation(self.file_object, self.log_writer)
            file_op.save_model(best_model, best_model_name)

            self.log_writer.log(self.file_object,
                                'Successful End of Training')
        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            # Bare raise keeps the real exception type, message and
            # traceback instead of replacing it with a blank Exception.
            raise
        finally:
            self.file_object.close()
class dBOperation:
    """SQLite operations for the prediction data.

    Creates the ``Good_Raw_Data`` table, loads the validated files into it
    and exports the table to a single CSV file.
    """

    def __init__(self):
        self.path = 'Prediction_Database/'
        self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
        self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_logger()

    def _write_log(self, log_path, message):
        """Append one *message* to the log at *log_path* and close it."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def dataBaseConnection(self, DatabaseName):
        """Open (creating if needed) ``<self.path><DatabaseName>.db``.

        Returns:
            The open ``sqlite3`` connection.

        Raises:
            sqlite3.Error, ConnectionError: re-raised unchanged after the
                failure has been logged.
        """
        connection_log = "Prediction_Logs/DataBaseConnectionLog.txt"
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except (sqlite3.Error, ConnectionError) as err:
            # sqlite3.connect reports failures as sqlite3.Error subclasses;
            # ConnectionError stays in the tuple for backward compatibility.
            self._write_log(
                connection_log,
                "Error while connecting to database: %s" % err)
            raise
        self._write_log(connection_log,
                        "Opened %s database successfully" % DatabaseName)
        return conn

    def createTableDb(self, DatabaseName, column_names):
        """Recreate ``Good_Raw_Data`` with one column per schema entry.

        Args:
            DatabaseName: database to open via ``dataBaseConnection``.
            column_names: mapping of column name to SQL type.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        table_log = "Prediction_Logs/DbTableCreateLog.txt"
        connection_log = "Prediction_Logs/DataBaseConnectionLog.txt"
        conn = None
        try:
            conn = self.dataBaseConnection(DatabaseName)
            conn.execute('DROP TABLE IF EXISTS Good_Raw_Data;')

            for column_name, data_type in column_names.items():
                try:
                    conn.execute(
                        'ALTER TABLE Good_Raw_Data ADD COLUMN '
                        '"{column_name}" {dataType}'.format(
                            column_name=column_name, dataType=data_type))
                except sqlite3.OperationalError:
                    # The table does not exist yet (first column): create
                    # it. The name is quoted the same way as in ALTER.
                    conn.execute(
                        'CREATE TABLE Good_Raw_Data '
                        '("{column_name}" {dataType})'.format(
                            column_name=column_name, dataType=data_type))

            conn.close()
            self._write_log(table_log, "Tables created successfully!!")
            self._write_log(
                connection_log,
                "Closed %s database successfully" % DatabaseName)
        except Exception as e:
            self._write_log(table_log,
                            "Error while creating table: %s " % e)
            # conn is None when the connection itself failed; closing it
            # unguarded would hide the real error behind a NameError.
            if conn is not None:
                conn.close()
                self._write_log(
                    connection_log,
                    "Closed %s database successfully" % DatabaseName)
            raise

    def insertIntoTableGoodData(self, Database):
        """Insert every row of every good file into ``Good_Raw_Data``.

        The header line of each file is skipped. Each remaining line is
        inserted and committed individually. On the first failure the
        transaction is rolled back, the offending file is moved to the bad
        folder and the error is re-raised, so later files are not loaded.

        Raises:
            Exception: the original error, re-raised after cleanup.
        """
        conn = self.dataBaseConnection(Database)
        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        try:
            onlyfiles = os.listdir(goodFilePath)
            with open("Prediction_Logs/DbInsertLog.txt", 'a+') as log_file:
                for file in onlyfiles:
                    try:
                        with open(goodFilePath + '/' + file, "r") as f:
                            next(f)  # skip the header line
                            # A newline "delimiter" makes each CSV line
                            # arrive as a single field.
                            reader = csv.reader(f, delimiter="\n")
                            for row in reader:
                                for values in row:
                                    # NOTE(review): the line is spliced into
                                    # the SQL text as-is, so file content is
                                    # executed as SQL literals. Left as is
                                    # because binding parameters would change
                                    # what gets stored; confirm the input is
                                    # trusted.
                                    conn.execute(
                                        'INSERT INTO Good_Raw_Data values '
                                        '({values})'.format(values=values))
                                    self.logger.log(
                                        log_file,
                                        " %s: File loaded successfully!!"
                                        % file)
                                    conn.commit()
                    except Exception as e:
                        conn.rollback()
                        self.logger.log(
                            log_file,
                            "Error while creating table: %s " % e)
                        shutil.move(goodFilePath + '/' + file, badFilePath)
                        self.logger.log(
                            log_file, "File Moved Successfully %s" % file)
                        raise
        finally:
            # Runs on success and on every failure path, including a
            # failing directory listing.
            conn.close()

    def selectingDatafromtableintocsv(self, Database):
        """Export ``Good_Raw_Data`` to ``Prediction_FileFromDB/InputFile.csv``.

        The output has a header row taken from the cursor description and
        every field quoted. The target directory is created when missing.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        self.fileFromDb = 'Prediction_FileFromDB/'
        self.fileName = 'InputFile.csv'
        with open("Prediction_Logs/ExportToCsv.txt", 'a+') as log_file:
            conn = None
            try:
                conn = self.dataBaseConnection(Database)
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM Good_Raw_Data")
                results = cursor.fetchall()
                headers = [column[0] for column in cursor.description]

                if not os.path.isdir(self.fileFromDb):
                    os.makedirs(self.fileFromDb)

                # Closing the output file explicitly guarantees the rows
                # are flushed to disk before success is logged.
                with open(self.fileFromDb + self.fileName, 'w',
                          newline='') as out_file:
                    csvFile = csv.writer(
                        out_file, delimiter=',', lineterminator='\r\n',
                        quoting=csv.QUOTE_ALL, escapechar='\\')
                    csvFile.writerow(headers)
                    csvFile.writerows(results)

                self.logger.log(log_file, "File exported successfully!!!")
            except Exception as e:
                self.logger.log(
                    log_file, "File exporting failed. Error : %s" % e)
                raise
            finally:
                if conn is not None:
                    conn.close()
class pred_validation:
    """Drive validation, transformation and loading of prediction files."""

    def __init__(self, path):
        self.raw_data = Prediction_data_validation(path)
        self.dataTransform = dataTransformationPredict()
        self.dBOperation = dBOperation()
        self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        self.log_writer = App_logger()

    def prediction_validation(self):
        """Run the whole prediction-side validation pipeline in order.

        Stages: raw file validation, missing-value replacement, table
        creation, data insertion, clean-up of the good/bad folders and
        export of the table to CSV. Progress is written to the prediction
        log. Any exception from a stage propagates to the caller.
        """

        def note(message):
            # Single place that writes a progress line to the log.
            self.log_writer.log(self.file_object, message)

        note('Start of Validation on files for prediction!!')

        # NOTE(review): Raw_Data_validation.valuesFromSchema in this file
        # returns (length, column_names, number_of_columns); confirm that
        # Prediction_data_validation really uses the order unpacked here.
        (LengthOfDateStampInFile,
         noofcolumns,
         column_names) = self.raw_data.valuesFromSchema()
        regex = self.raw_data.manualRegexCreation()
        self.raw_data.validateFileNameRaw(regex, LengthOfDateStampInFile)
        self.raw_data.validateColumnLength(noofcolumns)
        self.raw_data.validateMissingValuesInWholeColumn()
        note("Raw Data Validation Complete!!")

        note("Starting Data Transforamtion!!")
        self.dataTransform.replaceMissingWithNull()
        note("DataTransformation Completed!!!")

        note("Creating Prediction_Database and tables on the basis of "
             "given schema!!!")
        self.dBOperation.createTableDb('Prediction', column_names)
        note("Table creation Completed!!")

        note("Insertion of Data into Table started!!!!")
        self.dBOperation.insertIntoTableGoodData('Prediction')
        note("Insertion in Table completed!!!")

        note("Deleting Good Data Folder!!!")
        self.raw_data.deleteExistingGoodDataTrainingFolder()
        note("Good_Data folder deleted!!!")

        note("Moving bad files to Archive and deleting Bad_Data folder!!!")
        self.raw_data.moveBadFilesToArchiveBad()
        note("Bad files moved to archive!! Bad folder Deleted!!")

        note("Validation Operation completed!!")
        note("Extracting csv file from table")
        self.dBOperation.selectingDatafromtableintocsv('Prediction')
class Raw_Data_validation:
    """Validate raw training batch files against ``schema_training.json``.

    Files from the batch directory are sorted into
    ``Training_Raw_files_validated/Good_Raw`` and ``.../Bad_Raw`` based on
    file name, column count and wholly-empty columns.
    """

    # Working directories; the same spelling is used for creating, filling
    # and deleting them so the code also works on case-sensitive systems.
    _VALIDATED_ROOT = "Training_Raw_files_validated/"
    _GOOD_DIR = "Training_Raw_files_validated/Good_Raw/"
    _BAD_DIR = "Training_Raw_files_validated/Bad_Raw/"
    _GENERAL_LOG = "Training_Logs/GeneralLog.txt"

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_training.json'
        self.logger = App_logger()

    def _write_log(self, log_path, message):
        """Append one *message* to the log at *log_path* and close it."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def _delete_validated_dir(self, sub_dir, success_message):
        """Remove ``<validated root>/<sub_dir>`` if it exists and log it."""
        try:
            target = self._VALIDATED_ROOT + sub_dir
            if os.path.isdir(target):
                shutil.rmtree(target)
                self._write_log(self._GENERAL_LOG, success_message)
        except OSError as s:
            self._write_log(self._GENERAL_LOG,
                            "Error while Deleting Directory : %s" % s)
            raise

    def valuesFromSchema(self):
        """Read the expected file layout from the schema file.

        Returns:
            A tuple ``(LengthOfDateStampInFile, column_names,
            NumberofColumns)`` taken from the schema keys
            ``LengthOfDateStampInFile``, ``ColName`` and
            ``NumberofColumns``.

        Raises:
            ValueError: the schema is not valid JSON.
            KeyError: a required key is missing from the schema.
            Exception: any other failure (e.g. the file cannot be opened).
            All are logged and re-raised unchanged.
        """
        log_path = "Training_Logs/valuesfromSchemaValidationLog.txt"
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            # The value is unused, but the lookup keeps a schema without
            # 'SampleFileName' failing with KeyError as before.
            dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = (
                "LengthOfDateStampInFile:: %s\tNumberofColumns:: %s\n"
                % (LengthOfDateStampInFile, NumberofColumns))
            self._write_log(log_path, message)
        except ValueError:
            self._write_log(
                log_path,
                "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self._write_log(
                log_path, "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self._write_log(log_path, str(e))
            raise
        return LengthOfDateStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """Return the regular expression used to validate file names."""
        # Raw string: same characters as before, without relying on
        # invalid escape sequences in a normal string literal.
        regex = r"['data']+['\_'']+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """Create the Good_Raw and Bad_Raw directories if missing.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            for target in (self._GOOD_DIR, self._BAD_DIR):
                if not os.path.isdir(target):
                    os.makedirs(target)
        except OSError as ex:
            self._write_log(self._GENERAL_LOG,
                            "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """Delete the Good_Raw directory if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        self._delete_validated_dir(
            'Good_Raw/', "GoodRaw directory deleted successfully!!!")

    def deleteExistingBadDataTrainingFolder(self):
        """Delete the Bad_Raw directory if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        self._delete_validated_dir(
            'Bad_Raw/',
            "BadRaw directory deleted before starting validation!!!")

    def moveBadFilesToArchiveBad(self):
        """Move the Bad_Raw files to a timestamped archive folder.

        When Bad_Raw exists, its files are moved to
        ``TrainingArchiveBadData/BadData_<date>_<HHMMSS>`` (created as
        needed, files already present there are skipped) and Bad_Raw is
        then removed. Nothing happens when Bad_Raw does not exist.

        Raises:
            Exception: logged and re-raised unchanged.
        """
        now = datetime.now()
        date = now.date()
        time_stamp = now.strftime("%H%M%S")
        try:
            source = self._BAD_DIR
            if os.path.isdir(source):
                dest = ('TrainingArchiveBadData/BadData_' + str(date)
                        + "_" + str(time_stamp))
                # Also creates the parent archive folder when missing.
                os.makedirs(dest, exist_ok=True)

                # List the destination once, not once per file.
                already_archived = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in already_archived:
                        shutil.move(source + f, dest)

                with open(self._GENERAL_LOG, 'a+') as file:
                    self.logger.log(file, "Bad files moved to archive")
                    if os.path.isdir(source):
                        shutil.rmtree(source)
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            self._write_log(
                self._GENERAL_LOG,
                "Error while moving bad files to archive:: %s" % e)
            raise

    def validateFileNameRaw(self, regex, LengthOfDateStampInFile):
        """Sort the batch files into Good_Raw / Bad_Raw by file name.

        The Good_Raw and Bad_Raw folders are recreated first. A file is
        good when its name matches *regex* and the part after the first
        underscore (before ``.csv``) has *LengthOfDateStampInFile*
        characters; every other file is copied to Bad_Raw.

        Raises:
            Exception: logged and re-raised unchanged.
        """
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = os.listdir(self.Batch_Directory)
        log_path = "Training_Logs/nameValidationLog.txt"
        good_dir = "Training_Raw_files_validated/Good_Raw"
        bad_dir = "Training_Raw_files_validated/Bad_Raw"
        try:
            with open(log_path, 'a+') as f:
                for filename in onlyfiles:
                    # Copy from the directory that was actually listed.
                    source = os.path.join(self.Batch_Directory, filename)

                    name_is_valid = False
                    if re.match(regex, filename):
                        stem = re.split(r'\.csv', filename)[0]
                        parts = re.split('_', stem)
                        # A matching name without an underscore has no
                        # date stamp part: treat it as invalid.
                        name_is_valid = (
                            len(parts) > 1
                            and len(parts[1]) == LengthOfDateStampInFile)

                    if name_is_valid:
                        shutil.copy(source, good_dir)
                        self.logger.log(
                            f,
                            "Valid File name!! File moved to GoodRaw "
                            "Folder :: %s" % filename)
                    else:
                        shutil.copy(source, bad_dir)
                        self.logger.log(
                            f,
                            "Invalid File Name!! File moved to Bad Raw "
                            "Folder :: %s" % filename)
        except Exception as e:
            self._write_log(
                log_path, "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """Move good files with the wrong number of columns to Bad_Raw.

        Raises:
            OSError: logged and re-raised unchanged.
            Exception: logged and re-raised unchanged.
        """
        log_path = "Training_Logs/columnValidationLog.txt"
        try:
            with open(log_path, 'a+') as f:
                self.logger.log(f, "Column Length Validation Started!!")
                for file in os.listdir(self._GOOD_DIR):
                    frame = pd.read_csv(self._GOOD_DIR + file)
                    if frame.shape[1] != NumberofColumns:
                        shutil.move(self._GOOD_DIR + file,
                                    "Training_Raw_files_validated/Bad_Raw")
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File "
                            "moved to Bad Raw Folder :: %s" % file)
                self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as err:
            # Log the actual error, not the OSError class object.
            self._write_log(
                log_path,
                "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._write_log(log_path, "Error Occured:: %s" % e)
            raise

    def validateMissingValuesInWholeColumn(self):
        """Move good files that contain a completely empty column.

        A file with at least one column holding no values is moved to
        Bad_Raw. Every other file is rewritten in place with the column
        ``"Unnamed: 0"`` renamed to ``"Wafer"``.

        Raises:
            OSError: logged and re-raised unchanged.
            Exception: logged and re-raised unchanged.
        """
        log_path = "Training_Logs/missingValuesInColumn.txt"
        try:
            with open(log_path, 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in os.listdir(self._GOOD_DIR):
                    file_path = self._GOOD_DIR + file
                    frame = pd.read_csv(file_path)
                    # count() == 0 means every cell of the column is
                    # missing.
                    has_empty_column = any(
                        frame[column].count() == 0 for column in frame)
                    if has_empty_column:
                        shutil.move(file_path,
                                    "Training_Raw_files_validated/Bad_Raw")
                        # NOTE(review): message text is the one used for
                        # the column-length check; kept unchanged for log
                        # compatibility.
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File "
                            "moved to Bad Raw Folder :: %s" % file)
                    else:
                        frame.rename(columns={"Unnamed: 0": "Wafer"},
                                     inplace=True)
                        frame.to_csv(file_path, index=None, header=True)
        except OSError as err:
            # Log the actual error, not the OSError class object.
            self._write_log(
                log_path,
                "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._write_log(log_path, "Error Occured:: %s" % e)
            raise