def __init__(self, path):
    """Open the prediction log and build the helper objects.

    Args:
        path: Handed unchanged to ``Prediction_Data_Validation``.
    """
    log_location = "Prediction_Logs/Prediction_Log.txt"
    # 'a+' appends to the existing log instead of truncating it.
    self.file_object = open(log_location, 'a+')
    self.log_writer = App_Logger()
    self.pred_data_val = Prediction_Data_Validation(path)
class dBOperation:
    """
    This class shall be used for handling all the SQL operations
    (SQLite) of the training pipeline.

    Attributes set in ``__init__``:
        path: folder prefix under which the ``.db`` files are created.
        badFilePath / goodFilePath: folders holding rejected / accepted
            raw files.
        logger: ``App_Logger`` instance used for every log entry.
    """

    def __init__(self):
        self.path = 'Training_Database/'
        self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
        self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def _append_log(self, log_path, message):
        """Append one message to ``log_path`` and always close the file."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def dataBaseConnection(self, DatabaseName):
        """
        Method Name: dataBaseConnection
        Description: Opens ``<self.path><DatabaseName>.db``; SQLite creates
                     the file when it does not exist yet.
        Output: Connection to the DB
        On Failure: The error is logged and re-raised unchanged
                    (``sqlite3.Error`` or ``ConnectionError``).
        """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except (sqlite3.Error, ConnectionError) as err:
            # sqlite3 reports failures as sqlite3.Error, so catching only
            # ConnectionError (as before) never logged a real failure.
            self._append_log(
                "Training_Logs/DataBaseConnectionLog.txt",
                "Error while connecting to database: %s" % err)
            raise
        self._append_log("Training_Logs/DataBaseConnectionLog.txt",
                         "Opened %s database successfully" % DatabaseName)
        return conn

    def createTableDb(self, DatabaseName, column_names):
        """
        Method Name: createTableDb
        Description: Creates the table ``Good_Raw_Data`` in the given
                     database unless it already exists. ``column_names``
                     maps each column name to its SQL data type.
        Output: None
        On Failure: Raise Exception (logged first; the connection is
                    closed when one was opened)
        """
        conn = None
        try:
            conn = self.dataBaseConnection(DatabaseName)
            cursor = conn.cursor()
            cursor.execute(
                "SELECT count(name) FROM sqlite_master WHERE type = 'table'AND name = 'Good_Raw_Data'"
            )
            table_exists = cursor.fetchone()[0] == 1
            if not table_exists:
                for column_name, data_type in column_names.items():
                    # The first column creates the table (ALTER fails while
                    # the table is missing); later columns are added to it.
                    # Identifiers cannot be bound as parameters, so they
                    # are quoted the same way in both statements.
                    try:
                        conn.execute(
                            'ALTER TABLE Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'
                            .format(column_name=column_name,
                                    dataType=data_type))
                    except sqlite3.OperationalError:
                        conn.execute(
                            'CREATE TABLE Good_Raw_Data ("{column_name}" {dataType})'
                            .format(column_name=column_name,
                                    dataType=data_type))
            self._append_log("Training_Logs/DbTableCreateLog.txt",
                             "Tables created successfully!!")
        except Exception as e:
            self._append_log("Training_Logs/DbTableCreateLog.txt",
                             "Error while creating table: %s " % e)
            raise
        finally:
            # conn stays None when the connection itself failed; closing
            # unconditionally used to mask the real error.
            if conn is not None:
                conn.close()
                self._append_log(
                    "Training_Logs/DataBaseConnectionLog.txt",
                    "Closed %s database successfully" % DatabaseName)

    def insertIntoTableGoodData(self, Database):
        """
        Method Name: insertIntoTableGoodData
        Description: Inserts every file found in ``self.goodFilePath`` into
                     ``Good_Raw_Data``. The first line of each file is
                     skipped; every following line is used verbatim as the
                     VALUES list. A file is committed as a whole; when any
                     row fails, the file's rows are rolled back and the
                     file is moved to ``self.badFilePath``.
        Output: None
        On Failure: Raise Exception
        """
        conn = self.dataBaseConnection(Database)
        good_dir = self.goodFilePath
        bad_dir = self.badFilePath
        try:
            with open("Training_Logs/DbInsertLog.txt", 'a+') as log_file:
                for file_name in listdir(good_dir):
                    source_path = good_dir + '/' + file_name
                    try:
                        with open(source_path, "r") as f:
                            next(f)  # skip the header line
                            reader = csv.reader(f, delimiter="\n")
                            for row in reader:
                                for values in row:
                                    # NOTE(security): the statement is built
                                    # from file content. Kept because the
                                    # files carry ready-made SQL literals;
                                    # only validated files may reach here.
                                    conn.execute(
                                        'INSERT INTO Good_Raw_Data values ({values})'
                                        .format(values=values))
                        # One commit per file, so the rollback below really
                        # discards a partially loaded file.
                        conn.commit()
                        self.logger.log(
                            log_file,
                            " %s: File loaded successfully!!" % file_name)
                    except Exception as e:
                        conn.rollback()
                        self.logger.log(
                            log_file,
                            "Error while inserting into table: %s " % e)
                        shutil.move(source_path, bad_dir)
                        self.logger.log(
                            log_file,
                            "File Moved Successfully %s" % file_name)
        finally:
            # Closed once, after all files; closing inside the handler made
            # every file after the first bad one fail.
            conn.close()

    def selectingDatafromtableintocsv(self, Database):
        """
        Method Name: selectingDatafromtableintocsv
        Description: Exports all rows of ``Good_Raw_Data`` (with a header
                     row) to ``Training_FileFromDB/InputFile.csv``.
        Output: None
        On Failure: The error is written to the export log; it is not
                    re-raised.
        """
        self.fileFromDb = 'Training_FileFromDB/'
        self.fileName = 'InputFile.csv'
        conn = None
        with open("Training_Logs/ExportToCsv.txt", 'a+') as log_file:
            try:
                conn = self.dataBaseConnection(Database)
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM Good_Raw_Data")
                results = cursor.fetchall()
                # Column names are the first item of each description entry.
                headers = [column[0] for column in cursor.description]

                os.makedirs(self.fileFromDb, exist_ok=True)

                # The with-block guarantees the CSV is flushed and closed.
                with open(self.fileFromDb + self.fileName, 'w',
                          newline='') as out_file:
                    writer = csv.writer(out_file,
                                        delimiter=',',
                                        lineterminator='\r\n',
                                        quoting=csv.QUOTE_ALL,
                                        escapechar='\\')
                    writer.writerow(headers)
                    writer.writerows(results)

                self.logger.log(log_file, "File exported successfully!!!")
            except Exception as e:
                self.logger.log(log_file,
                                "File exporting failed. Error : %s" % e)
            finally:
                if conn is not None:
                    conn.close()
def __init__(self, path):
    """Store the batch directory and point at the training schema file.

    Args:
        path: Value kept as ``self.Batch_Directory``.
    """
    self.schema_path = 'schema_training.json'
    self.Batch_Directory = path
    self.logger = App_Logger()
def __init__(self, path):
    """Store the batch directory and point at the prediction schema file.

    Args:
        path: Value kept as ``self.Batch_Directory``.
    """
    self.schema_path = 'schema_prediction.json'
    self.Batch_Directory = path
    self.logger = App_Logger()
def __init__(self, path):
    """Store the batch directory and point at the training schema file.

    Args:
        path: Value kept as ``self.Batch_Directory``.
    """
    self.schema_path = 'schema_training.json'
    self.Batch_Directory = path
    self.logger = App_Logger()
def __init__(self):
    """Set the prediction database folder and the Good/Bad raw-file folders."""
    self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
    self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
    self.path = 'Prediction_Database/'
    self.logger = App_Logger()
class Training_Model:
    """Run the text-classification training pipeline end to end.

    ``train_model`` loads the data, preprocesses and vectorizes it, trains
    the requested models, evaluates them and saves the one with the best
    f1 score.
    """

    # Model key accepted in ``models_list`` -> name of the ``Model_Tuner``
    # method that trains it. Unknown keys are ignored.
    _TUNER_METHODS = {
        'svm': 'get_params_svm',
        'rf': 'get_params_for_RandomForest',
        'xg': 'get_best_params_for_XGBoost',
        'mnb': 'get_params_bagging_naive_bayes',
    }

    def __init__(
        self,
        models_list,
        sampling_method,
    ):
        """
        Args:
            models_list: iterable of model keys ('svm', 'rf', 'xg', 'mnb').
            sampling_method: 'us' for under-sampling, 'os' for
                over-sampling; any other value leaves the data as is.
        """
        self.logger_object = App_Logger()
        self.file_object = open('Training_Logs/ModelTrainingLog.txt', 'a+')
        self.sampling_method = sampling_method
        self.models_list = models_list

    def train_model(self):
        """Train, evaluate and save the best model.

        Returns:
            The value returned by ``File_Operation.save_model``
            ('success' is treated as the success marker).

        Raises:
            ValueError: when the evaluation report is empty, so there is
                no model to save.
            Exception: any pipeline error is logged and re-raised;
                ``self.file_object`` is closed in that case.
        """
        with open('Training_Logs/General_Log.txt', 'a+') as general_log:
            self.logger_object.log(
                general_log,
                'Entered train_model() method of Training_Model class')
        try:
            data_getter = Data_Getter(self.file_object, self.logger_object)
            data = data_getter.get_data()

            preprocessor = PreProcessor(self.file_object, self.logger_object,
                                        self.sampling_method)
            data = preprocessor.remove_null(data)
            data = preprocessor.clean_reviews(data)
            data = preprocessor.remove_StopWords(data)
            data = preprocessor.remove_punctuations(data)
            data = preprocessor.pos_tagging_lemmatizeText(data)
            data = preprocessor.encode_label(data)

            x, y = preprocessor.separate_feature_label(data)
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.3, random_state=100)

            x_train, x_test = preprocessor.count_vectorizer(x_train, x_test)
            x_train, x_test = preprocessor.tfidfTransformer_vectorizer(
                x_train, x_test)

            # Resampling touches the training split only.
            if self.sampling_method == 'us':
                x_train, y_train = preprocessor.under_sampling(
                    x_train, y_train)
            elif self.sampling_method == 'os':
                x_train, y_train = preprocessor.over_sampling(
                    x_train, y_train)

            tuner = Model_Tuner(self.file_object, self.logger_object)
            # Every known key is present; models not requested stay None.
            self.trained_models_dict = dict.fromkeys(self._TUNER_METHODS)
            for model_key in self.models_list:
                method_name = self._TUNER_METHODS.get(model_key)
                if method_name is None:
                    continue
                self.trained_models_dict[model_key] = getattr(
                    tuner, method_name)(x_train, y_train)

            model_evaluation = Model_Evaluation(self.trained_models_dict,
                                                x_test, y_test,
                                                self.file_object,
                                                self.logger_object)
            self.model_evaluation_report_dict = model_evaluation.generate_models_evaluation_report_dict(
                self.trained_models_dict)
            self.ordered_model_evaluation_report_dict = sorted(
                self.model_evaluation_report_dict.items(),
                key=lambda item: item[1]['f1_score'],
                reverse=True)

            if not self.ordered_model_evaluation_report_dict:
                # Previously surfaced as an unbound-variable error.
                raise ValueError(
                    'No model evaluation results available; nothing to save')
            model_to_save = self.ordered_model_evaluation_report_dict[0][0]

            file_operation = File_Operation(self.file_object,
                                            self.logger_object)
            is_model_saved = file_operation.save_model(
                self.trained_models_dict[model_to_save], model_to_save)
            if is_model_saved == 'success':
                self.logger_object.log(self.file_object,
                                       'Successfull End of Training')
            else:
                self.logger_object.log(
                    self.file_object,
                    'Error while saving model to models directory')
            return is_model_saved
        except Exception:
            self.logger_object.log(self.file_object,
                                   'Unsuccessfull End of Training')
            self.file_object.close()
            raise
class Prediction_Data_validation:
    """
    This class shall be used for handling all the validation done on the
    Raw Prediction Data. Files are read from an S3 bucket and staged in
    MongoDB collections of the ``temp_db`` database.

    Written By: Rajat Bisoi
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        self.mongo = To_mongo_db('wafer')
        self.aws = Aws_Bucket_operation(
            local_file_name_address='config/bucket_name')

    def valuesFromSchema(self):
        """
        Method Name: valuesFromSchema
        Description: Reads the schema document from the
                     ``schema_wafer_prediction`` collection and extracts
                     the values needed for validation.
        Output: LengthOfDateStampInFile, LengthOfTimeStampInFile,
                column_names, Number of Columns
        On Failure: Raise ValueError, KeyError, Exception (logged first,
                    then re-raised unchanged)
        """
        try:
            schema_ids = self.mongo.Get_ID('schema_wafer_prediction',
                                           'temp_db')
            dic = self.mongo.downlaod_from_mongo_raw(
                'schema_wafer_prediction', 'temp_db', schema_ids[0])
            # Not used further; looked up so a schema without this key
            # still fails with KeyError as before.
            _ = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log('wafer_log', message)
        except ValueError:
            self.logger.log(
                'wafer_log',
                "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self.logger.log('wafer_log',
                            "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self.logger.log('wafer_log', str(e))
            raise

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Method Name: manualRegexCreation
        Description: Returns the manually defined regex used to validate
                     the file name of the prediction data.
        Output: Regex pattern
        On Failure: None
        """
        # Raw string: same pattern text as before, without the invalid
        # escape sequences (\_, \d, \.) of a normal string literal.
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    @staticmethod
    def _has_valid_file_name(filename, regex, date_length, time_length):
        """Return True when ``filename`` matches ``regex`` and its 2nd and
        3rd ``_``-separated parts have the expected lengths."""
        if not re.match(regex, filename):
            return False
        stem = re.split(r'\.csv', filename)[0]
        parts = re.split('_', stem)
        # Names with fewer than three parts are invalid rather than an
        # IndexError that aborts the whole batch.
        return (len(parts) > 2 and len(parts[1]) == date_length
                and len(parts[2]) == time_length)

    # Not used
    # NOTE(review): the four file-system methods below pass an open file
    # to the logger while the rest of the class passes 'wafer_log' --
    # confirm which form App_Logger.log expects.
    def createDirectoryForGoodBadRawData(self):
        """
        Method Name: createDirectoryForGoodBadRawData
        Description: Creates the local Good_Raw and Bad_Raw directories
                     under Prediction_Raw_Files_Validated.
        Output: None
        On Failure: OSError (logged, then re-raised)
        """
        try:
            for sub_dir in ("Good_Raw/", "Bad_Raw/"):
                path = os.path.join("Prediction_Raw_Files_Validated/",
                                    sub_dir)
                os.makedirs(path, exist_ok=True)
        except OSError as ex:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while creating Directory %s:" % ex)
            raise

    # Not used
    def deleteExistingGoodDataTrainingFolder(self):
        """
        Method Name: deleteExistingGoodDataTrainingFolder
        Description: Deletes the local Good_Raw directory when it exists.
        Output: None
        On Failure: OSError (logged, then re-raised)
        """
        try:
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Good_Raw/'):
                shutil.rmtree(path + 'Good_Raw/')
                with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(
                        file, "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while Deleting Directory : %s" % s)
            raise

    # Not used
    def deleteExistingBadDataTrainingFolder(self):
        """
        Method Name: deleteExistingBadDataTrainingFolder
        Description: Deletes the local Bad_Raw directory when it exists.
        Output: None
        On Failure: OSError (logged, then re-raised)
        """
        try:
            path = 'Prediction_Raw_Files_Validated/'
            if os.path.isdir(path + 'Bad_Raw/'):
                shutil.rmtree(path + 'Bad_Raw/')
                with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(
                        file,
                        "BadRaw directory deleted before starting validation!!!"
                    )
        except OSError as s:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file,
                                "Error while Deleting Directory : %s" % s)
            raise

    # Not used
    def moveBadFilesToArchiveBad(self):
        """
        Method Name: moveBadFilesToArchiveBad
        Description: Moves the files of the local Bad_Raw directory into a
                     time-stamped folder under PredictionArchivedBadData
                     and then deletes the Bad_Raw directory.
        Output: None
        On Failure: OSError (logged, then re-raised)
        """
        now = datetime.now()
        source = 'Prediction_Raw_Files_Validated/Bad_Raw/'
        dest = 'PredictionArchivedBadData/BadData_' + str(
            now.date()) + "_" + now.strftime("%H%M%S")
        try:
            # Also creates the PredictionArchivedBadData parent folder.
            os.makedirs(dest, exist_ok=True)
            # Listed once: source names are unique, so moving one file
            # cannot make another one "already archived".
            already_archived = set(os.listdir(dest))
            for name in os.listdir(source):
                if name not in already_archived:
                    shutil.move(source + name, dest)
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file, "Bad files moved to archive")
                if os.path.isdir(source):
                    shutil.rmtree(source)
                self.logger.log(
                    file, "Bad Raw Data Folder Deleted successfully!!")
        except OSError as e:
            with open("Prediction_Logs/GeneralLog.txt", 'a+') as file:
                self.logger.log(
                    file, "Error while moving bad files to archive:: %s" % e)
            raise

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Method Name: validationFileNameRaw
        Description: Empties the good/bad prediction collections, then
                     downloads every object of the 'wafer-prediction'
                     bucket. Objects with a valid name are sent to
                     ``wafer_good_data_prediction``; all others are sent
                     as raw JSON to ``wafer_bad_data_prediction``.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        self.mongo.Delete_collection('temp_db', 'wafer_bad_data_prediction')
        self.mongo.Delete_collection('temp_db', 'wafer_good_data_prediction')

        bucket_inst = self.aws.Create_S3_Bucket_Instance(
            bucket_prefix='wafer-prediction')
        try:
            for obj in bucket_inst.objects.all():
                data = self.aws.Download_From_S3(obj.key)
                if self._has_valid_file_name(obj.key, regex,
                                             LengthOfDateStampInFile,
                                             LengthOfTimeStampInFile):
                    self.mongo.send_to_mongo('wafer_good_data_prediction',
                                             'temp_db', data)
                    self.logger.log(
                        'wafer_log',
                        f'file {obj.key} uploaded to collection wafer_good_data'
                    )
                else:
                    bad_data = json.loads(data.to_json())
                    self.mongo.send_to_mongo_raw('wafer_bad_data_prediction',
                                                 'temp_db', bad_data)
                    self.logger.log(
                        'wafer_log',
                        f'invalid file name {obj.key} uploaded to collection wafer_bad_data'
                    )
        except Exception as e:
            self.logger.log('wafer_log',
                            "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name: validateColumnLength
        Description: Checks every document of ``wafer_good_data_prediction``
                     for the expected number of columns. Documents with a
                     different column count are moved to
                     ``wafer_bad_data_prediction``.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        try:
            self.logger.log('wafer_log', "Column Length Validation Started!!")
            idx = self.mongo.Get_ID('wafer_good_data_prediction', 'temp_db')
            for file in idx:
                try:
                    testfile = self.mongo.downlaod_one_from_mongo(
                        'wafer_good_data_prediction', 'temp_db', file)
                except Exception as err:
                    try:
                        # NOTE(review): the fallback reads from
                        # 'wafer_good_raw_prediction' although the id was
                        # listed from 'wafer_good_data_prediction' --
                        # confirm the collection name.
                        testfile = self.mongo.downlaod_from_mongo_raw(
                            'wafer_good_raw_prediction', 'temp_db', file)
                    except Exception as err1:
                        # Source is the collection the id was listed from
                        # (was 'wafer_good_data').
                        self.mongo.Move_data_in_collections(
                            'wafer_good_data_prediction',
                            'wafer_bad_data_prediction', 'temp_db', file)
                        self.logger.log(
                            'wafer_log',
                            "Invalid Column Length for the file !! File moved to "
                            "wafer_Bad_Raw_prediction collection ")
                        # A list cannot be raised; chain both errors.
                        raise err1 from err
                testfile = pd.DataFrame(testfile)
                if testfile.shape[1] != NumberofColumns:
                    self.mongo.Move_data_in_collections(
                        'wafer_good_data_prediction',
                        'wafer_bad_data_prediction', 'temp_db', file)
                    self.logger.log(
                        'wafer_log',
                        "Invalid Column Length for the file !! File moved to "
                        "wafer_Bad_Raw_prediction collection ")
            self.logger.log('wafer_log',
                            "Column Length Validation Completed!!")
        except OSError as e:
            self.logger.log('wafer_log',
                            f"Error Occured while moving the file {e}")
            raise
        except Exception as e:
            self.logger.log("wafer_log", f"Error Occured {e}")
            raise

    def deletePredictionFile(self):
        """Delete the ``prediction_output`` collection of ``temp_db``."""
        self.mongo.Delete_collection('temp_db', 'prediction_output')

    def validateMissingValuesInWholeColumn(self):
        """
        Method Name: validateMissingValuesInWholeColumn
        Description: Checks every document of ``wafer_good_data_prediction``
                     for a column without any non-missing value. Such
                     documents are moved to ``wafer_bad_data_prediction``;
                     the others are sent to the good collection again.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        try:
            self.logger.log('wafer_log',
                            "Missing Values Validation Started!!")
            idx = self.mongo.Get_ID('wafer_good_data_prediction', 'temp_db')
            for file in idx:
                testfile = pd.DataFrame(
                    self.mongo.downlaod_one_from_mongo(
                        'wafer_good_data_prediction', 'temp_db', file))
                # count() ignores missing values, so 0 means the whole
                # column is missing.
                has_empty_column = any(testfile[column].count() == 0
                                       for column in testfile)
                if has_empty_column:
                    self.mongo.Move_data_in_collections(
                        'wafer_good_data_prediction',
                        'wafer_bad_data_prediction', 'temp_db', file)
                    self.logger.log(
                        'wafer_log',
                        f"Invalid Column Length for the file!! File moved to wafer_bad_data_prediction :: {file}"
                    )
                else:
                    # NOTE(review): sends the frame to the collection it
                    # was read from -- confirm this does not duplicate it.
                    self.mongo.send_to_mongo('wafer_good_data_prediction',
                                             'temp_db', testfile)
        except OSError as e:
            self.logger.log(
                'wafer_log', "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self.logger.log('wafer_log', "Error Occured:: %s" % e)
            raise
class Prediction_Data_validation:
    """
    This class shall be used for handling all the validation done on the
    Raw Prediction Data. Files are kept in S3 folders (via
    ``AwsStorageManagement``) and the schema in MongoDB (via
    ``mongoDBOperation``).
    """

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def valuesFromSchema(self):
        """
        Method Name: valuesFromSchema
        Description: Loads the schema file into the ``predict_schema``
                     collection when that collection is missing, then
                     reads the schema back and extracts the values needed
                     for validation.
        Output: LengthOfDateStampInFile, LengthOfTimeStampInFile,
                column_names, Number of Columns
        On Failure: Raise ValueError, KeyError, Exception (logged first,
                    then re-raised unchanged)
        """
        log_name = 'valuesfromSchemaValidationLog'
        try:
            if not self.dbObj.isCollectionPresent('mushroomClassifierDB',
                                                  'predict_schema'):
                with open(self.schema_path, 'r') as f:
                    dic = json.load(f)
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'predict_schema', dic)
            dic = self.dbObj.getRecords('mushroomClassifierDB',
                                        'predict_schema')
            # Not used further; looked up so a schema without this key
            # still fails with KeyError as before.
            _ = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self.logger.log(log_name, message)
        except ValueError:
            self.logger.log(
                log_name,
                "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self.logger.log(log_name,
                            "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self.logger.log(log_name, str(e))
            raise

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Method Name: manualRegexCreation
        Description: Returns the manually defined regex used to validate
                     the file name of the prediction data.
        Output: Regex pattern
        On Failure: None
        """
        # Raw string: same pattern text as before, without the invalid
        # escape sequences (\_, \d, \.) of a normal string literal.
        regex = r"['mushroom']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    @staticmethod
    def _has_valid_file_name(filename, regex, date_length, time_length):
        """Return True when ``filename`` matches ``regex`` and its 2nd and
        3rd ``_``-separated parts have the expected lengths."""
        if not re.match(regex, filename):
            return False
        stem = re.split(r'\.csv', filename)[0]
        parts = re.split('_', stem)
        # Names with fewer than three parts are invalid rather than an
        # IndexError that aborts the whole batch.
        return (len(parts) > 2 and len(parts[1]) == date_length
                and len(parts[2]) == time_length)

    def createDirectoryForGoodBadRawData(self):
        """
        Method Name: createDirectoryForGoodBadRawData
        Description: Creates the S3 folders for validated good and bad
                     prediction files.
        Output: None
        On Failure: The error is logged; it is not re-raised.
        """
        try:
            self.awsObj.createS3Directory(
                'Prediction_Good_Raw_Files_Validated')
            self.awsObj.createS3Directory('Prediction_Bad_Raw_Files_Validated')
        except Exception as ex:
            self.logger.log('GeneralLog',
                            "Error while creating Directory %s:" % ex)

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Method Name: deleteExistingGoodDataTrainingFolder
        Description: Deletes the S3 folder holding the validated good
                     prediction files.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        try:
            # Delete first, so success is only logged when it happened.
            self.awsObj.deleteDirectory('Prediction_Good_Raw_Files_Validated')
            self.logger.log('GeneralLog',
                            "GoodRaw directory deleted successfully!!!")
        except Exception as s:
            self.logger.log('GeneralLog',
                            "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """
        Method Name: deleteExistingBadDataTrainingFolder
        Description: Deletes the S3 folder holding the bad prediction
                     files.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        try:
            # Delete first, so success is only logged when it happened.
            self.awsObj.deleteDirectory('Prediction_Bad_Raw_Files_Validated')
            self.logger.log(
                'GeneralLog',
                "BadRaw directory deleted before starting validation!!!")
        except Exception as s:
            self.logger.log('GeneralLog',
                            "Error while Deleting Directory : %s" % s)
            raise

    def moveBadFilesToArchiveBad(self):
        """
        Method Name: moveBadFilesToArchiveBad
        Description: Copies the bad-file S3 folder into a time-stamped
                     folder under PredictionArchivedBadData.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        now = datetime.now()
        target_folder = 'PredictionArchivedBadData/BadData_' + str(
            now.date()) + "_" + now.strftime("%H%M%S")
        try:
            self.awsObj.copyFileToFolder('Prediction_Bad_Raw_Files_Validated',
                                         target_folder)
            self.logger.log('GeneralLog', "Bad files moved to archive")
            # NOTE(review): this message is logged although no delete call
            # is made here -- confirm whether the source folder should be
            # removed after the copy.
            self.logger.log('GeneralLog',
                            "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            self.logger.log(
                'GeneralLog',
                "Error while moving bad files to archive:: %s" % e)
            raise

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Method Name: validationFileNameRaw
        Description: Resets the good/bad S3 folders, uploads the batch
                     directory and copies each file to the good folder
                     when its name is valid, otherwise to the bad folder.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        # Start from clean folders in case the last run left them behind.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()

        batch_dir = self.Batch_Directory.strip('/').strip('\\')
        print('Prediction File Path: ', batch_dir)
        self.awsObj.uploadFiles(batch_dir, batch_dir)
        onlyfiles = self.awsObj.listDirFiles(batch_dir)
        log_name = 'nameValidationLog'
        try:
            for filename in onlyfiles:
                if self._has_valid_file_name(filename, regex,
                                             LengthOfDateStampInFile,
                                             LengthOfTimeStampInFile):
                    self.awsObj.copyFileToFolder(
                        batch_dir, 'Prediction_Good_Raw_Files_Validated',
                        filename)
                    self.logger.log(
                        log_name,
                        "Valid File name!! File moved to GoodRaw Folder :: %s"
                        % filename)
                else:
                    # Copy from the same (stripped) folder the files were
                    # uploaded to and listed from.
                    self.awsObj.copyFileToFolder(
                        batch_dir, 'Prediction_Bad_Raw_Files_Validated',
                        filename)
                    self.logger.log(
                        log_name,
                        "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                        % filename)
        except Exception as e:
            self.logger.log(log_name,
                            "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name: validateColumnLength
        Description: Checks every file of the good folder for the expected
                     number of columns. Matching files are saved back
                     unchanged; the others are moved to the bad folder.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        log_name = 'columnValidationLog'
        good_dir = 'Prediction_Good_Raw_Files_Validated'
        try:
            self.logger.log(log_name, "Column Length Validation Started!!")
            for file in self.awsObj.listDirFiles(good_dir):
                frame = self.awsObj.csvToDataframe(good_dir, file)
                if frame.shape[1] == NumberofColumns:
                    self.awsObj.saveDataframeToCsv(good_dir, file, frame)
                else:
                    self.awsObj.moveFileToFolder(
                        good_dir, 'Prediction_Bad_Raw_Files_Validated', file)
                    self.logger.log(
                        log_name,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)
            self.logger.log(log_name, "Column Length Validation Completed!!")
        except OSError as e:
            self.logger.log(
                log_name, "Error Occurred while moving the file :: %s" % e)
            raise
        except Exception as e:
            self.logger.log(log_name, "Error Occurred:: %s" % e)
            raise

    def deletePredictionFile(self):
        """Delete ``Predictions.csv`` from the ``Prediction_Output_File`` folder."""
        self.awsObj.deleteFile('Prediction_Output_File', 'Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
        Method Name: validateMissingValuesInWholeColumn
        Description: Checks every file of the good folder for a column
                     without any non-missing value. Such files are moved
                     to the bad folder; the others are saved back.
        Output: None
        On Failure: Exception (logged, then re-raised)
        """
        log_name = 'missingValuesInColumn'
        good_dir = 'Prediction_Good_Raw_Files_Validated'
        try:
            self.logger.log(log_name, "Missing Values Validation Started!!")
            for file in self.awsObj.listDirFiles(good_dir):
                frame = self.awsObj.csvToDataframe(good_dir, file)
                # count() ignores missing values, so 0 means the whole
                # column is missing.
                has_empty_column = any(frame[column].count() == 0
                                       for column in frame)
                if has_empty_column:
                    self.awsObj.moveFileToFolder(
                        good_dir, 'Prediction_Bad_Raw_Files_Validated', file)
                    self.logger.log(
                        log_name,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)
                else:
                    self.awsObj.saveDataframeToCsv(good_dir, file, frame)
        except OSError as e:
            self.logger.log(
                log_name, "Error Occurred while moving the file :: %s" % e)
            raise
        except Exception as e:
            self.logger.log(log_name, "Error Occurred:: %s" % e)
            raise
import pandas as pd from utility import TextSumarization import argparse from application_logging.logger import App_Logger import time logger_object = App_Logger() general_logs = open("Logs/generallogs.txt", '+a') success_file = open("Logs/successlogs.txt", '+a') error_file = open("Logs/errorlogs.txt", '+a') def run(): parser = argparse.ArgumentParser( description='Summarization of TextData using pretrained models') parser.add_argument('-path', dest='path', default='Final_news.csv', help='File path') parser.add_argument( '-model', dest='model', default='bert-base-uncased', help='[xlnet-base-cased,distilbert-base-uncased,albert-base-v1]') args = parser.parse_args() try: logger_object.log( general_logs, "Entered the runner file --> Running the script now !") if not args.path: raise RuntimeError("Must supply text path.")
class trainValidation:
    """
    Orchestrates validation of the raw training batch files and their load
    into the training database.

    Attributes set by the constructor:
        raw_data      -- Raw_Data_Validation built for the given batch path
        dataTransform -- dataTransform instance (not used by train_validation)
        dBOperation   -- dBOperation instance used for the SQLite steps
        file_object   -- handle on Training_Logs/Training_Main_Log.txt
        log_writer    -- App_Logger used to write to file_object
    """

    def __init__(self, path):
        self.raw_data = Raw_Data_Validation(path)
        self.dataTransform = dataTransform()  # No use in train_validation
        self.dBOperation = dBOperation()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
        self.log_writer = App_Logger()

    def train_validation(self):
        """
        Run the validation pipeline end to end.

        Steps, in order: read the schema, validate file names, validate the
        column count, reject files with an entirely empty column, create the
        table, insert the good data, delete the good-data folder, archive the
        bad files and export the table to csv.

        Output: None
        On Failure: the exception raised by the failing step propagates
                    unchanged. The main log file is closed in every case.
        """
        try:
            self.log_writer.log(self.file_object,
                                'Start of Validation on files for prediction!!')
            # extracting values from the schema
            (LengthOfDateStampInFile, LengthOfTimeStampInFile,
             column_names, noofcolumns) = self.raw_data.valuesFromSchema()
            # getting the regex defined to validate filename
            regex = self.raw_data.manualRegexCreation()
            # validating filename of the batch files
            self.raw_data.validationFileNameRaw(regex, LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            # validating column length in the file
            self.raw_data.validateColumnLength(noofcolumns)
            # validating if any column has all values missing
            self.raw_data.validateMissingValuesInWholeColumn()
            self.log_writer.log(self.file_object,
                                "Raw Data Validation Complete!!")

            # storing in the database
            self.log_writer.log(
                self.file_object,
                "Creating Training_Database and tables on the basis of given schema!!!")
            # create database with given name, if present open the connection!
            # Create table with columns given in schema
            self.dBOperation.createTableDb('Training', column_names)
            self.log_writer.log(self.file_object, "Table creation Completed!!")
            self.log_writer.log(self.file_object,
                                "Insertion of Data into Table started!!!!")
            # insert into the database
            self.dBOperation.insertIntoTableGoodData('Training')
            self.log_writer.log(self.file_object,
                                "Insertion in Table completed!!!")
            self.log_writer.log(self.file_object,
                                "Deleting Good Data Folder!!!")
            # Delete the good data folder after loading files in table
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self.log_writer.log(self.file_object,
                                "Good_Data folder deleted!!!")
            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            # Move the bad files to archive folder
            self.raw_data.movebadFilesToArchiveBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object,
                                "Validation Operation completed!!")
            self.log_writer.log(self.file_object,
                                "Extracting csv file from table")
            # export data in table to csvfile
            self.dBOperation.selectingDatafromtableintocsv('Training')
        finally:
            # Close the main log whether the pipeline succeeded or failed, so
            # a failing step no longer leaks the handle.
            self.file_object.close()
class Prediction_Data_Validation:
    '''
    This class will be used for validation on the raw Testing data or Prediction data

    @author Niranjan
    version : 1.0
    Revision : None
    '''

    # Staging folders. Every method uses these same spellings so that the
    # folders created, scanned and deleted are the same ones on
    # case-sensitive filesystems too.
    _RAW_DIR = 'Prediction_Raw_Files_validated/'
    _GOOD_DIR = _RAW_DIR + 'Good_Raw/'
    _BAD_DIR = _RAW_DIR + 'Bad_Raw/'

    _SCHEMA_LOG = 'Prediction_Logs/valuesfromSchemaValidationLog.txt'
    _GENERAL_LOG = "Prediction_Logs/GeneralLog.txt"
    _NAME_LOG = "Prediction_Logs/nameValidationLog.txt"
    _COLUMN_LOG = "Prediction_Logs/columnValidationLog.txt"
    _MISSING_LOG = "Prediction_Logs/missingValuesInColumn.txt"

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()

    def _log(self, log_path, message):
        """Append one message to the log file at log_path and close it again."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def valuesFromSchema(self):
        """
        Method Name : valuesFromSchema
        Description : Reads the JSON schema at self.schema_path and extracts
                      the values used by the other validation steps.
        Output      : LengthOfDateStampInFile, LengthOfTimeStampInFile,
                      column_names, NumberofColumns
        On Failure  : Raise ValueError, KeyError, Exception (the original
                      exception is logged and re-raised)
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        try:
            with open(self.schema_path, 'r') as schema_file:
                dic = json.load(schema_file)
            # Looked up only so that a schema without this key still fails
            # with KeyError; the value itself is not used.
            dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self._log(self._SCHEMA_LOG, message)
        except ValueError:
            self._log(self._SCHEMA_LOG,
                      " ValueError: Value not found inside schema")
            raise
        except KeyError:
            self._log(self._SCHEMA_LOG,
                      " KeyError: Key value error incorrect key passed")
            raise
        except Exception as e:
            self._log(self._SCHEMA_LOG, str(e))
            raise
        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        Method Name : manualRegexCreation
        Description : Returns the manually defined regex used to validate
                      the file name of the prediction data.
        Output      : Regex pattern
        On Failure  : None
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        # NOTE(review): the bracketed parts are character classes, not
        # literal words, so this accepts more names than
        # "cement_strength_<date>_<time>.csv". Left unchanged because
        # tightening it would change which files are accepted.
        regex = r"['cement_strength']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
        Method Name : createDirectoryForGoodBadRawData
        Description : Creates the Good_Raw and Bad_Raw staging directories
                      if they do not exist yet.
        Output      : None
        On Failure  : OSError
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        try:
            for path in (self._GOOD_DIR, self._BAD_DIR):
                if not os.path.isdir(path):
                    os.makedirs(path)
        except OSError as ex:
            self._log(self._GENERAL_LOG,
                      "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """
        Method Name : deleteExistingGoodDataTrainingFolder
        Description : Deletes the Good_Raw staging directory if it exists.
        Output      : None
        On Failure  : OSError
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        try:
            if os.path.isdir(self._GOOD_DIR):
                shutil.rmtree(self._GOOD_DIR)
                self._log(self._GENERAL_LOG,
                          "Good_Raw directory delted successfully !!!")
        except OSError as ex:
            self._log(self._GENERAL_LOG,
                      "Error while Deleting Good_Raw Directory %s:" % ex)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """
        Method Name : deleteExistingBadDataTrainingFolder
        Description : Deletes the Bad_Raw staging directory if it exists.
        Output      : None
        On Failure  : OSError
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        try:
            if os.path.isdir(self._BAD_DIR):
                shutil.rmtree(self._BAD_DIR)
                self._log(self._GENERAL_LOG,
                          "Bad_Raw directory delted successfully !!!")
        except OSError as ex:
            self._log(self._GENERAL_LOG,
                      "Error while Deleting Bad_Raw Directory %s:" % ex)
            raise

    def moveBadFilesToArchiveBad(self):
        """
        Method Name : movebadFilesToArchiveBad
        Description : Moves every file in Bad_Raw into a timestamped folder
                      under PredictionArchivedBadData and then deletes the
                      Bad_Raw directory. Does nothing when Bad_Raw does not
                      exist.
        Output      : None
        On Failure  : OSError
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = self._BAD_DIR
            if os.path.isdir(source):
                dest = 'PredictionArchivedBadData/BadData_' + str(
                    date) + '_' + str(time)
                # makedirs also creates the PredictionArchivedBadData parent
                os.makedirs(dest, exist_ok=True)
                # Names already archived are skipped; listed once because
                # source names are unique, so moves cannot affect the check.
                already_archived = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in already_archived:
                        shutil.move(source + f, dest)
                with open(self._GENERAL_LOG, 'a+') as file:
                    self.logger.log(file,
                                    "All Bad files are moved archive folder")
                    if os.path.isdir(source):
                        shutil.rmtree(source)
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except OSError as e:
            self._log(self._GENERAL_LOG,
                      "Error while moving bad files to archive:: %s" % e)
            raise

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        Method Name : validationFileNameRaw
        Description : Validates the name of every file in
                      self.Batch_Directory. A file is copied to Good_Raw when
                      its name matches the regex and its date and time stamps
                      (third and fourth '_'-separated parts) have the lengths
                      given by the schema; otherwise it is copied to Bad_Raw.
        Output      : None
        On Failure  : Exception
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        # delete the directories for good and bad data in case last run was
        # unsuccessful and folders were not deleted.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        # create new directories
        self.createDirectoryForGoodBadRawData()
        onlyfiles = os.listdir(self.Batch_Directory)
        try:
            with open(self._NAME_LOG, 'a+') as f:
                for filename in onlyfiles:
                    # Copy from the directory that was listed, not from a
                    # hard-coded folder name.
                    source = os.path.join(self.Batch_Directory, filename)
                    is_valid = False
                    if re.match(regex, filename):
                        splitAtDot = re.split('.csv', filename)
                        splitAtDot = re.split('_', splitAtDot[0])
                        is_valid = (
                            len(splitAtDot[2]) == LengthOfDateStampInFile
                            and len(splitAtDot[3]) == LengthOfTimeStampInFile)
                    if is_valid:
                        shutil.copy(source, self._GOOD_DIR)
                        self.logger.log(
                            f,
                            "Valid File Name !! File moved Good Raw folder :: %s"
                            % filename)
                    else:
                        shutil.copy(source, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid File Name !! File moved Bad Raw folder :: %s"
                            % filename)
        except Exception as e:
            self._log(self._NAME_LOG,
                      "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """
        Method Name : validateColumnLength
        Description : Checks the number of columns of every csv file in
                      Good_Raw against NumberofColumns. Files with a
                      different column count are moved to Bad_Raw; matching
                      files are left in place.
        Output      : None
        On Failure  : The error is written to the column validation log and
                      NOT re-raised; processing of the remaining files stops.
        Author      : Niranjan
        Version     : 1.0
        Revision    : None
        """
        # NOTE(review): unlike validateMissingValuesInWholeColumn this method
        # swallows errors after logging them. Kept as is because callers may
        # rely on it; confirm whether it should raise instead.
        try:
            with open(self._COLUMN_LOG, 'a+') as f:
                self.logger.log(f, "Column Length validation started !!")
                for file in os.listdir(self._GOOD_DIR):
                    csv = pd.read_csv(self._GOOD_DIR + file)
                    if csv.shape[1] != NumberofColumns:
                        shutil.move(self._GOOD_DIR + file,
                                    self._BAD_DIR + file)
                        self.logger.log(
                            f,
                            "Invalid column length!! File moved to Bad_Raw folder")
                self.logger.log(f, "Column Length validation completed")
        except OSError as e:
            self._log(self._COLUMN_LOG,
                      "Error occured while moving the file :: %s" % e)
        except Exception as e:
            self._log(self._COLUMN_LOG, "Error occured:: %s" % e)

    def deletePredictionFile(self):
        """Delete Prediction_Output_File/Predictions.csv if it exists."""
        if os.path.exists('Prediction_Output_File/Predictions.csv'):
            os.remove('Prediction_Output_File/Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
        Method Name : validateMissingValuesInWholeColumn
        Description : Moves every csv file in Good_Raw that has at least one
                      column with all values missing to Bad_Raw. Files that
                      pass are rewritten in place, with a column named
                      "Unnamed: 0" renamed to "Wafer" when present.
        Output      : None
        On Failure  : Exception
        Written By  : Niranjan
        Version     : 1.0
        Revisions   : None
        """
        try:
            with open(self._MISSING_LOG, 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in os.listdir(self._GOOD_DIR):
                    csv = pd.read_csv(self._GOOD_DIR + file)
                    # count() excludes missing values, so zero means the
                    # whole column is missing
                    has_empty_column = any(
                        csv[columns].count() == 0 for columns in csv)
                    if has_empty_column:
                        shutil.move(self._GOOD_DIR + file, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)
                    else:
                        csv.rename(columns={"Unnamed: 0": "Wafer"},
                                   inplace=True)
                        csv.to_csv(self._GOOD_DIR + file,
                                   index=None,
                                   header=True)
        except OSError as e:
            self._log(self._MISSING_LOG,
                      "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self._log(self._MISSING_LOG, "Error Occured:: %s" % e)
            raise
class dataTransform:
    """
    Transforms the Good Raw training files before they are loaded into the
    database.

    Written By: Malini
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Rewrites every csv file in self.goodDataPath so that
                     each value of the 'Id' and 'ActivityDate' columns is
                     wrapped in single quotes. Any failure is written to the
                     log file and not re-raised.

        Written By: Malini
        Version: 1.0
        Revisions: None
        """

        def _quoted(value):
            # wrap the textual form of the value in single quotes
            return "'" + str(value) + "'"

        log_file = open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                        'a+')
        try:
            for csv_name in listdir(self.goodDataPath):
                csv_path = self.goodDataPath + "/" + csv_name
                frame = pd.read_csv(csv_path)
                frame['Id'] = frame["Id"].apply(_quoted)
                frame['ActivityDate'] = frame["ActivityDate"].apply(_quoted)
                frame.to_csv(csv_path, index=None, header=True)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % csv_name)
        except Exception as err:
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % err)
            log_file.close()
        log_file.close()
def __init__(self):
    """Create the logger (constructed with 'wafer') and a To_mongo_db instance."""
    # The local good-data folder is not used by this variant:
    # self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
    self.logger = App_Logger('wafer')
    # presumably the MongoDB access helper -- confirm against To_mongo_db
    self.mongo = To_mongo_db()
class dataTransformPredict:
    """
    Transforms the Good Raw prediction files before they are loaded into the
    database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    _LOG_PATH = "Prediction_Logs/dataTransformLog.txt"

    # columns holding string values; every value in them gets quoted
    _STRING_COLUMNS = frozenset([
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'TBG', 'referral_source', 'Class'
    ])

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Rewrites every csv file in self.goodDataPath in place.
                     Each value of a string column is wrapped in single
                     quotes; in every other column only the value '?' is
                     replaced by the quoted "'?'".
        Output: None
        On Failure: the error is written to the data transform log and the
                    exception is re-raised.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        try:
            # 'with' closes the log handle on failure as well, so the error
            # handler below does not leave a second handle open.
            with open(self._LOG_PATH, 'a+') as log_file:
                for file in listdir(self.goodDataPath):
                    data = pandas.read_csv(self.goodDataPath + "/" + file)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            # add quotes in string value
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                        else:
                            # add quotes to '?' values in integer/float columns
                            data[col] = data[col].replace('?', "'?'")
                    data.to_csv(self.goodDataPath + "/" + file,
                                index=None,
                                header=True)
                    self.logger.log(log_file,
                                    " %s: Quotes added successfully!!" % file)
        except Exception as e:
            with open(self._LOG_PATH, 'a+') as log_file:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
            raise
def __init__(self, path):
    """Store the batch directory and schema path and create the helper objects.

    path is kept as self.Batch_Directory; the schema file name is fixed to
    'schema_prediction.json'.
    """
    self.Batch_Directory = path
    self.schema_path = 'schema_prediction.json'
    self.logger = App_Logger()
    # presumably the storage and database helpers used by the validation
    # methods -- confirm against AwsStorageManagement / mongoDBOperation
    self.awsObj = AwsStorageManagement()
    self.dbObj = mongoDBOperation()
class dataTransformPredict:
    """
    Transforms the Good Raw prediction files before they are loaded into the
    database.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    # columns holding string values; every value in them gets quoted
    _STRING_COLUMNS = frozenset([
        "potential_issue", "deck_risk", "oe_constraint", "ppap_risk",
        "stop_auto_buy", "rev_stop", "went_on_backorder"
    ])

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: Rewrites every csv file in self.goodDataPath in place
                     with all missing values replaced by the text 'NULL'
                     (including the single quotes).
        Output: None
        On Failure: the error is written to the data transform log and the
                    exception is re-raised.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # The log is opened before the try block so the error handler always
        # has a valid handle; a failure to open it propagates as is.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    csv = pandas.read_csv(self.goodDataPath + "/" + file)
                    csv.fillna("'NULL'", inplace=True)
                    csv.to_csv(self.goodDataPath + "/" + file,
                               index=None,
                               header=True)
                    self.logger.log(
                        log_file, " %s: File Transformed successfully!!" % file)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: Rewrites every csv file in self.goodDataPath in place
                     with each value of the string columns wrapped in single
                     quotes, so they can be inserted as text values.
        Output: None
        On Failure: the error is written to the log file and NOT re-raised.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # NOTE(review): this prediction step logs under Training_Logs and,
        # unlike replaceMissingWithNull, swallows errors. Both are kept
        # because other code may depend on them; confirm the intent.
        with open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                  'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    data = pandas.read_csv(self.goodDataPath + "/" + file)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            # add quotes in string value
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                    data.to_csv(self.goodDataPath + "/" + file,
                                index=None,
                                header=True)
                    self.logger.log(log_file,
                                    " %s: Quotes added successfully!!" % file)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
def __init__(self):
    """Set the training file path, open the file-read log and create the logger.

    NOTE(review): the log handle opened here is not closed in this method;
    confirm that another method of the class closes self.file_object.
    """
    self.training_file='../training_file/Churn_Modelling.csv'
    # opened in append mode and kept open on the instance
    self.file_object = open("../logs/filereadlogs/log.txt", 'a+')
    self.logger = App_Logger()
class Prediction_Data_validation:
    """
    This class shall be used for handling all the validation done on the Raw Prediction Data!!.
    """

    # Staging folders. The delete, create, copy and scan steps all use these
    # same paths, so stale folders from a previous run are really removed.
    _RAW_DIR = "Prediction_Raw_Files_Validated/"
    _GOOD_DIR = _RAW_DIR + "Good_Raw/"
    _BAD_DIR = _RAW_DIR + "Bad_Raw/"

    _SCHEMA_LOG = "Prediction_Logs/valuesfromSchemaValidationLog.txt"
    _GENERAL_LOG = "Prediction_Logs/GeneralLog.txt"
    _NAME_LOG = "Prediction_Logs/nameValidationLog.txt"
    _COLUMN_LOG = "Prediction_Logs/columnValidationLog.txt"
    _MISSING_LOG = "Prediction_Logs/missingValuesInColumn.txt"

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()

    def _log(self, log_path, message):
        """Append one message to the log file at log_path and close it again."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def valuesFromSchema(self):
        """
        This method extracts all the relevant information from the
        pre-defined "Schema" file at self.schema_path.

        Errors are logged and the original exception is re-raised
        (ValueError, KeyError or any other Exception).
        :return: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Numberofcolumns
        """
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = "LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" + "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberofColumns + "\n"
            self._log(self._SCHEMA_LOG, message)
        except ValueError:
            # name the schema file that was actually read
            self._log(
                self._SCHEMA_LOG,
                "ValueError:Value not found inside %s" % self.schema_path)
            raise
        except KeyError:
            self._log(self._SCHEMA_LOG,
                      "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self._log(self._SCHEMA_LOG, str(e))
            raise
        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns

    def manualRegexCreation(self):
        """
        This method contains a manually defined regex based on the "FileName"
        given in "Schema" file. This Regex is used to validate the filename
        of the prediction data.
        :return: Regex pattern
        """
        # "SampleFileName": "creditCardFraud_021119920_010222.csv"
        # Raw string: same pattern text as before, without invalid string
        # escape sequences.
        # NOTE(review): the bracketed parts are character classes, not
        # literal words, so more names match than the sample suggests.
        regex = r"['creditCardFraud']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
        This method creates the Good_Raw and Bad_Raw staging directories if
        they do not exist yet. OSError is logged and re-raised.
        :return: None
        """
        try:
            for path in (self._GOOD_DIR, self._BAD_DIR):
                if not os.path.isdir(path):
                    os.makedirs(path)
        except OSError as ex:
            self._log(self._GENERAL_LOG,
                      "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataPredictionFolder(self):
        """
        This method deletes the Good_Raw staging directory if it exists.
        OSError is logged and re-raised.
        :return: None
        """
        try:
            if os.path.isdir(self._GOOD_DIR):
                shutil.rmtree(self._GOOD_DIR)
                self._log(self._GENERAL_LOG,
                          "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            self._log(self._GENERAL_LOG,
                      "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataPredictionFolder(self):
        """
        This method deletes the Bad_Raw staging directory if it exists.
        OSError is logged and re-raised.
        :return: None
        """
        try:
            if os.path.isdir(self._BAD_DIR):
                shutil.rmtree(self._BAD_DIR)
                self._log(self._GENERAL_LOG,
                          "Bad_Raw directory deleted successfully!!!")
        except OSError as s:
            self._log(self._GENERAL_LOG,
                      "Error while Deleting Directory : %s" % s)
            raise

    def moveBadFilesToArchiveBad(self):
        """
        This method moves every file in Bad_Raw into a timestamped folder
        under PredictionArchiveBadData and then deletes the Bad_Raw
        directory. OSError (including a missing Bad_Raw directory) is logged
        and re-raised.
        :return: None
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = self._BAD_DIR
            dest = 'PredictionArchiveBadData/BadData_' + str(time) + '_' + str(
                date)
            # makedirs also creates the PredictionArchiveBadData parent
            os.makedirs(dest, exist_ok=True)
            files = os.listdir(source)
            # Names already archived are skipped; listed once because source
            # names are unique, so moves cannot affect the check.
            already_archived = set(os.listdir(dest))
            for f in files:
                if f not in already_archived:
                    shutil.move(source + f, dest)
            with open(self._GENERAL_LOG, 'a+') as file:
                self.logger.log(file, "Bad files moved to archive")
                if os.path.isdir(source):
                    shutil.rmtree(source)
                self.logger.log(file,
                                "Bad Raw Data Folder Deleted successfully!!")
        except OSError as e:
            self._log(self._GENERAL_LOG,
                      "Error while moving bad files to archive:: %s" % e)
            raise

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
        This function validates the name of every file in
        self.Batch_Directory. A file is copied to Good_Raw when its name
        matches the regex and its date and time stamps (second and third
        '_'-separated parts) have the expected lengths; otherwise it is
        copied to Bad_Raw. Any error is logged and re-raised.

        :param regex: pattern the file name must match
        :param LengthOfDateStampInFile: expected length of the date stamp
        :param LengthOfTimeStampInFile: expected length of the time stamp
        :return: None
        """
        # delete the directories for good and bad data in case last run was
        # unsuccessful and folders were not deleted.
        self.deleteExistingBadDataPredictionFolder()
        self.deleteExistingGoodDataPredictionFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = os.listdir(self.Batch_Directory)
        try:
            with open(self._NAME_LOG, 'a+') as f:
                for filename in onlyfiles:
                    # Copy from the directory that was listed, not from a
                    # hard-coded folder name.
                    source = os.path.join(self.Batch_Directory, filename)
                    is_valid = False
                    if re.match(regex, filename):
                        splitAtDot = re.split('.csv', filename)
                        splitAtDot = re.split('_', splitAtDot[0])
                        is_valid = (
                            len(splitAtDot[1]) == LengthOfDateStampInFile
                            and len(splitAtDot[2]) == LengthOfTimeStampInFile)
                    if is_valid:
                        shutil.copy(source, self._GOOD_DIR)
                        self.logger.log(
                            f,
                            "Valid File name !! File moved to Good_Raw Folder :: %s"
                            % filename)
                    else:
                        shutil.copy(source, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid File name !! File moved to Bad Raw Folder :: %s"
                            % filename)
        except Exception as e:
            self._log(self._NAME_LOG,
                      "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """
        This function validates the number of columns of every csv file in
        Good_Raw. Files with a different column count are moved to Bad_Raw;
        matching files are rewritten in place. Any error is logged and
        re-raised.

        :param NumberofColumns: expected number of columns
        :return: None
        """
        try:
            with open(self._COLUMN_LOG, 'a+') as f:
                self.logger.log(f, "Column Length Validation Started!!")
                for file in os.listdir(self._GOOD_DIR):
                    csv = pd.read_csv(self._GOOD_DIR + file)
                    if csv.shape[1] == NumberofColumns:
                        csv.to_csv(self._GOOD_DIR + file,
                                   index=None,
                                   header=True)
                    else:
                        shutil.move(self._GOOD_DIR + file, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as e:
            self._log(self._COLUMN_LOG,
                      "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self._log(self._COLUMN_LOG, "Error Occured:: %s" % e)
            raise

    def deletePredictionFile(self):
        """Delete Prediction_Output_File/Predictions.csv if it exists."""
        if os.path.exists("Prediction_Output_File/Predictions.csv"):
            os.remove('Prediction_Output_File/Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
        This function moves every csv file in Good_Raw that has at least one
        column with all values missing to Bad_Raw. Files that pass are
        rewritten in place. Any error is logged and re-raised.
        :return: None
        """
        try:
            with open(self._MISSING_LOG, 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in os.listdir(self._GOOD_DIR):
                    csv = pd.read_csv(self._GOOD_DIR + file)
                    # count() excludes missing values, so zero means the
                    # whole column is missing
                    has_empty_column = any(
                        csv[columns].count() == 0 for columns in csv)
                    if has_empty_column:
                        shutil.move(self._GOOD_DIR + file, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)
                    else:
                        csv.to_csv(self._GOOD_DIR + file,
                                   index=None,
                                   header=True)
        except OSError as e:
            self._log(self._MISSING_LOG,
                      "Error Occured while moving the file :: %s" % e)
            raise
        except Exception as e:
            self._log(self._MISSING_LOG, "Error Occurred:: %s" % e)
            raise
class dBOperation:
    """Handle the SQLite operations of the prediction pipeline.

    The class creates the ``Good_Raw_Data`` table, loads the validated
    ("good") CSV files into it and exports the table back to one CSV file.
    Every step is reported through ``App_Logger`` into ``Prediction_Logs/``.
    """

    # Log files written by this class.
    _CONNECTION_LOG = "Prediction_Logs/DataBaseConnectionLog.txt"
    _TABLE_LOG = "Prediction_Logs/DbTableCreateLog.txt"
    _INSERT_LOG = "Prediction_Logs/DbInsertLog.txt"
    _EXPORT_LOG = "Prediction_Logs/ExportToCsv.txt"

    def __init__(self):
        self.path = 'Prediction_Database/'
        self.badFilePath = "Prediction_Raw_Files_Validated/Bad_Raw"
        self.goodFilePath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def _log(self, log_path, message):
        """Append ``message`` to ``log_path`` and close the file again."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def dataBaseConnection(self, DatabaseName):
        """Open (creating it if needed) the SQLite database ``DatabaseName``.

        Returns:
            The open ``sqlite3`` connection; the caller must close it.

        Raises:
            sqlite3.Error or ConnectionError: re-raised unchanged after the
            failure has been written to the connection log.
        """
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except (sqlite3.Error, ConnectionError) as err:
            # sqlite3 reports failures as sqlite3.Error, not ConnectionError,
            # so both are caught to make sure the failure is logged.
            self._log(self._CONNECTION_LOG,
                      "Error while connecting to database: %s" % err)
            raise
        self._log(self._CONNECTION_LOG,
                  "Opened %s database successfully" % DatabaseName)
        return conn

    def createTableDb(self, DatabaseName, column_names):
        """Recreate ``Good_Raw_Data`` with the columns given in ``column_names``.

        Args:
            DatabaseName: name of the database file (without ``.db``).
            column_names: mapping of column name -> SQL data type.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        conn = None
        try:
            conn = self.dataBaseConnection(DatabaseName)
            conn.execute('DROP TABLE IF EXISTS Good_Raw_Data;')
            if column_names:
                # Names are double-quoted so that names containing spaces or
                # punctuation are kept intact instead of being parsed as
                # "name type".
                columns_sql = ", ".join(
                    '"{column_name}" {dataType}'.format(
                        column_name=str(key).replace('"', '""'),
                        dataType=data_type)
                    for key, data_type in column_names.items())
                conn.execute(
                    'CREATE TABLE Good_Raw_Data ({columns})'.format(
                        columns=columns_sql))
        except Exception as e:
            self._log(self._TABLE_LOG, "Error while creating table: %s " % e)
            raise
        else:
            self._log(self._TABLE_LOG, "Tables created successfully!!")
        finally:
            # conn stays None when the connection itself could not be opened.
            if conn is not None:
                conn.close()
                self._log(self._CONNECTION_LOG,
                          "Closed %s database successfully" % DatabaseName)

    @staticmethod
    def _insert_file(conn, file_path):
        """Insert every data line of ``file_path`` (header skipped) into the table."""
        with open(file_path, "r") as f:
            next(f)  # skip the header line
            # With "\n" as delimiter each record holds the whole text line.
            for row in csv.reader(f, delimiter="\n"):
                for values in row:
                    # NOTE(review): the line is spliced into the statement
                    # verbatim, so its fields are read as SQL literals.
                    # Presumably an upstream step formats them (quotes, NULL);
                    # confirm before switching to bound parameters.
                    conn.execute(
                        'INSERT INTO Good_Raw_Data values ({values})'.format(
                            values=values))

    def insertIntoTableGoodData(self, Database):
        """Load every file of the good-data folder into ``Good_Raw_Data``.

        Each file is committed as one unit. When a file fails, its rows are
        rolled back, the file is moved to the bad-data folder and the error
        is re-raised, which stops the remaining files from being processed.
        """
        conn = self.dataBaseConnection(Database)
        try:
            with open(self._INSERT_LOG, 'a+') as log_file:
                for file in listdir(self.goodFilePath):
                    file_path = self.goodFilePath + '/' + file
                    try:
                        self._insert_file(conn, file_path)
                        conn.commit()
                        self.logger.log(
                            log_file,
                            " %s: File loaded successfully!!" % file)
                    except Exception as e:
                        conn.rollback()
                        self.logger.log(
                            log_file,
                            "Error while inserting into table: %s " % e)
                        shutil.move(file_path, self.badFilePath)
                        self.logger.log(log_file,
                                        "File Moved Successfully %s" % file)
                        raise
        finally:
            conn.close()

    def selectingDatafromtableintocsv(self, Database):
        """Export ``Good_Raw_Data`` to ``Prediction_FileFromDB/InputFile.csv``.

        The output directory is created when missing. All fields are quoted.

        Raises:
            Exception: any failure is logged and re-raised unchanged.
        """
        self.fileFromDb = 'Prediction_FileFromDB/'
        self.fileName = 'InputFile.csv'
        with open(self._EXPORT_LOG, 'a+') as log_file:
            conn = None
            try:
                conn = self.dataBaseConnection(Database)
                cursor = conn.cursor()
                cursor.execute("SELECT * FROM Good_Raw_Data")
                results = cursor.fetchall()
                # Column names of the result set become the CSV header.
                headers = [i[0] for i in cursor.description]
                if not os.path.isdir(self.fileFromDb):
                    os.makedirs(self.fileFromDb)
                # The with-block guarantees the CSV is flushed and closed.
                with open(self.fileFromDb + self.fileName, 'w',
                          newline='') as out_file:
                    csvFile = csv.writer(out_file,
                                         delimiter=',',
                                         lineterminator='\r\n',
                                         quoting=csv.QUOTE_ALL,
                                         escapechar='\\')
                    csvFile.writerow(headers)
                    csvFile.writerows(results)
                self.logger.log(log_file, "File exported successfully!!!")
            except Exception as e:
                self.logger.log(log_file,
                                "File exporting failed. Error : %s" % e)
                raise
            finally:
                if conn is not None:
                    conn.close()
class Prediction_Data_validation:
    """Validate the raw prediction batch files.

    Files are checked against the schema file (name pattern, number of
    columns, columns without any value). Accepted files end up in
    ``Prediction_Raw_Files_Validated/Good_Raw``, rejected ones in
    ``Prediction_Raw_Files_Validated/Bad_Raw``. Every step is logged into
    ``Prediction_Logs/``.
    """

    _LOG_DIR = "Prediction_Logs/"
    _VALIDATED_DIR = 'Prediction_Raw_Files_Validated/'
    _GOOD_DIR = "Prediction_Raw_Files_Validated/Good_Raw"
    _BAD_DIR = "Prediction_Raw_Files_Validated/Bad_Raw"

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()

    def _log(self, log_name, message):
        """Append ``message`` to ``Prediction_Logs/<log_name>`` and close it."""
        with open(self._LOG_DIR + log_name, 'a+') as log_file:
            self.logger.log(log_file, message)

    def valuesFromSchema(self):
        """Read the validation settings from the schema file.

        Returns:
            Tuple ``(LengthOfDateStampInFile, LengthOfTimeStampInFile,
            column_names, NumberofColumns)``.

        Raises:
            ValueError, KeyError, Exception: logged and re-raised unchanged.
        """
        log_name = "valuesfromSchemaValidationLog.txt"
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            dic['SampleFileName']  # required key; its value is not used here
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = (
                "LengthOfDateStampInFile:: %s\tLengthOfTimeStampInFile:: %s"
                "\t NumberofColumns:: %s\n"
                % (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                   NumberofColumns))
            self._log(log_name, message)
        except ValueError:
            self._log(log_name,
                      "ValueError:Value not found inside %s"
                      % self.schema_path)
            raise
        except KeyError:
            self._log(log_name,
                      "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self._log(log_name, str(e))
            raise
        return (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                column_names, NumberofColumns)

    def manualRegexCreation(self):
        """Return the hand-written regex used to validate batch file names."""
        # Raw string: same pattern text, without invalid-escape warnings.
        regex = r"['wafer']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """Create the Good_Raw and Bad_Raw folders when they do not exist.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            for folder in ("Good_Raw/", "Bad_Raw/"):
                path = os.path.join("Prediction_Raw_Files_Validated/", folder)
                if not os.path.isdir(path):
                    os.makedirs(path)
        except OSError as ex:
            self._log("GeneralLog.txt",
                      "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """Delete the Good_Raw folder (with its content) if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            if os.path.isdir(self._VALIDATED_DIR + 'Good_Raw/'):
                shutil.rmtree(self._VALIDATED_DIR + 'Good_Raw/')
                self._log("GeneralLog.txt",
                          "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            self._log("GeneralLog.txt",
                      "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """Delete the Bad_Raw folder (with its content) if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            if os.path.isdir(self._VALIDATED_DIR + 'Bad_Raw/'):
                shutil.rmtree(self._VALIDATED_DIR + 'Bad_Raw/')
                self._log(
                    "GeneralLog.txt",
                    "BadRaw directory deleted before starting validation!!!")
        except OSError as s:
            self._log("GeneralLog.txt",
                      "Error while Deleting Directory : %s" % s)
            raise

    def moveBadFilesToArchiveBad(self):
        """Move the Bad_Raw files into a time-stamped archive folder.

        The archive is ``PredictionArchivedBadData/BadData_<date>_<HHMMSS>``.
        Afterwards the Bad_Raw folder is deleted. Nothing happens when the
        Bad_Raw folder does not exist.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Prediction_Raw_Files_Validated/Bad_Raw/'
            if os.path.isdir(source):
                path = "PredictionArchivedBadData"
                if not os.path.isdir(path):
                    os.makedirs(path)
                dest = 'PredictionArchivedBadData/BadData_' + str(
                    date) + "_" + str(time)
                if not os.path.isdir(dest):
                    os.makedirs(dest)
                # Names already archived are skipped; listing once is enough
                # because the names inside the source folder are unique.
                archived = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in archived:
                        shutil.move(source + f, dest)
                with open(self._LOG_DIR + "GeneralLog.txt", 'a+') as file:
                    self.logger.log(file, "Bad files moved to archive")
                    if os.path.isdir(self._VALIDATED_DIR + 'Bad_Raw/'):
                        shutil.rmtree(self._VALIDATED_DIR + 'Bad_Raw/')
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except OSError as e:
            self._log("GeneralLog.txt",
                      "Error while moving bad files to archive:: %s" % e)
            raise

    @staticmethod
    def _has_valid_name(filename, regex, LengthOfDateStampInFile,
                        LengthOfTimeStampInFile):
        """Tell whether ``filename`` matches the pattern and stamp lengths."""
        if not re.match(regex, filename):
            return False
        parts = re.split('_', re.split('.csv', filename)[0])
        # A matching name without both stamps is invalid, not an error.
        return (len(parts) > 2
                and len(parts[1]) == LengthOfDateStampInFile
                and len(parts[2]) == LengthOfTimeStampInFile)

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """Sort the batch files into Good_Raw / Bad_Raw by their file name.

        The Good_Raw and Bad_Raw folders are recreated first, in case an
        earlier run left them behind. Files are copied from
        ``self.Batch_Directory``.

        Raises:
            Exception: logged and re-raised unchanged.
        """
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = [f for f in listdir(self.Batch_Directory)]
        try:
            with open(self._LOG_DIR + "nameValidationLog.txt", 'a+') as f:
                for filename in onlyfiles:
                    # Copy from the directory that was listed above.
                    source = os.path.join(self.Batch_Directory, filename)
                    if self._has_valid_name(filename, regex,
                                            LengthOfDateStampInFile,
                                            LengthOfTimeStampInFile):
                        shutil.copy(source, self._GOOD_DIR)
                        self.logger.log(
                            f,
                            "Valid File name!! File moved to GoodRaw Folder :: %s"
                            % filename)
                    else:
                        shutil.copy(source, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                            % filename)
        except Exception as e:
            self._log("nameValidationLog.txt",
                      "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """Move Good_Raw files with a wrong number of columns to Bad_Raw.

        Files with the expected number of columns are rewritten with the
        column ``Unnamed: 0`` renamed to ``Wafer``.

        Raises:
            OSError, Exception: logged and re-raised unchanged.
        """
        log_name = "columnValidationLog.txt"
        try:
            with open(self._LOG_DIR + log_name, 'a+') as f:
                self.logger.log(f, "Column Length Validation Started!!")
                for file in listdir(self._GOOD_DIR + "/"):
                    file_path = self._GOOD_DIR + "/" + file
                    frame = pd.read_csv(file_path)
                    if frame.shape[1] == NumberofColumns:
                        frame.rename(columns={"Unnamed: 0": "Wafer"},
                                     inplace=True)
                        frame.to_csv(file_path, index=None, header=True)
                    else:
                        shutil.move(file_path, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as err:
            self._log(log_name,
                      "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._log(log_name, "Error Occured:: %s" % e)
            raise

    def deletePredictionFile(self):
        """Delete the prediction output file of an earlier run, if present."""
        if os.path.exists('Prediction_Output_File/Predictions.csv'):
            os.remove('Prediction_Output_File/Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """Move Good_Raw files having a column without any value to Bad_Raw.

        The remaining files are rewritten with the column ``Unnamed: 0``
        renamed to ``Wafer``.

        Raises:
            OSError, Exception: logged and re-raised unchanged.
        """
        log_name = "missingValuesInColumn.txt"
        try:
            with open(self._LOG_DIR + log_name, 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in listdir(self._GOOD_DIR + "/"):
                    file_path = self._GOOD_DIR + "/" + file
                    frame = pd.read_csv(file_path)
                    # count() ignores missing values, so 0 means "all missing".
                    has_empty_column = any(
                        frame[column].count() == 0 for column in frame)
                    if has_empty_column:
                        shutil.move(file_path, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)
                    else:
                        frame.rename(columns={"Unnamed: 0": "Wafer"},
                                     inplace=True)
                        frame.to_csv(file_path, index=None, header=True)
        except OSError as err:
            self._log(log_name,
                      "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._log(log_name, "Error Occured:: %s" % e)
            raise
class dataTransform:
    """Prepare the Good Raw training files for loading into the database."""

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Wrap string values in single quotes in every good training file.

        Every value of the listed string columns is converted to text and
        enclosed in single quotes. In all other columns only the value
        ``?`` is replaced by ``'?'``. Each file is rewritten in place.

        A failure is written to the log file and not propagated.
        """
        # Columns holding string data.
        string_columns = [
            'sex', 'on_thyroxine', 'query_on_thyroxine',
            'on_antithyroid_medication', 'sick', 'pregnant',
            'thyroid_surgery', 'I131_treatment', 'query_hypothyroid',
            'query_hyperthyroid', 'lithium', 'goitre', 'tumor',
            'hypopituitary', 'psych', 'TSH_measured', 'T3_measured',
            'TT4_measured', 'T4U_measured', 'FTI_measured', 'TBG_measured',
            'TBG', 'referral_source', 'Class'
        ]
        with open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                  'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    target = self.goodDataPath + "/" + file_name
                    frame = pd.read_csv(target)
                    for name in frame.columns:
                        if name in string_columns:
                            # quote every value of a string column
                            frame[name] = frame[name].apply(
                                lambda value: "'" + str(value) + "'")
                        else:
                            # quote only the '?' placeholder elsewhere
                            frame[name] = frame[name].replace('?', "'?'")
                    frame.to_csv(target, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as error:
                self.logger.log(
                    log_file,
                    "Data Transformation failed because:: %s" % error)
def __init__(self):
    """Create the logger and remember the folder of good training files."""
    self.logger = App_Logger()
    # Folder holding the training files that passed validation.
    self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
class train_validation:
    """Run the validation and database loading steps for the training data."""

    def __init__(self, path):
        self.raw_data = Raw_Data_Validation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = dBOperation()
        self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
        self.log_writer = App_Logger()

    def _log(self, message):
        """Write ``message`` to the main training log."""
        self.log_writer.log(self.file_object, message)

    def train_validation(self):
        """Validate the raw files, load them into the DB and export a CSV.

        Steps: read the schema, validate file names, column count and empty
        columns, create the table, insert the good files, delete the good
        folder, archive the bad files and export the table to CSV.

        Returns:
            Whatever ``selectingDatafromtableintocsv('Training')`` returns.

        Raises:
            Exception: any failure is logged and re-raised unchanged. The
            main log file is closed in both cases.
        """
        try:
            self._log('Start of Validation on files for prediction!!')
            (LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names,
             noofcolumns) = self.raw_data.valuesFromSchema()
            print("column names:{}".format(column_names))
            regex = self.raw_data.manualRegexCreation()
            self.raw_data.validationFileNameRaw(regex,
                                                LengthOfDateStampInFile,
                                                LengthOfTimeStampInFile)
            self.raw_data.validateColumnLength(noofcolumns)
            self.raw_data.validateMissingValuesInWholeColumn()
            self._log("Raw Data Validation Complete!!")

            self._log(
                "Creating Training_Database and tables on the basis of given schema!!!"
            )
            # Passed positionally: the receiving parameter is spelled
            # ``DatabaseName``, so the keyword ``DataBaseName`` would raise.
            self.dBOperation.createTableDb("Training", column_names)
            self._log("Table creation Completed!!")

            self._log("Insertion of Data into Table started!!!!")
            self.dBOperation.insertIntoTableGoodData('Training')
            self._log("Insertion in Table completed!!!")

            self._log("Deleting Good Data Folder!!!")
            self.raw_data.deleteExistingGoodDataTrainingFolder()
            self._log("Good_Data folder deleted!!!")

            self._log(
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            self.raw_data.moveBadFilesToArchiveBad()
            self._log("Bad files moved to archive!! Bad folder Deleted!!")
            self._log("Validation Operation completed!!")

            self._log("Extracting csv file from table")
            return self.dBOperation.selectingDatafromtableintocsv('Training')
        except Exception:
            self._log("Conversion to input csv failed")
            raise
        finally:
            # Single place that closes the log for success and failure.
            self.file_object.close()
def __init__(self):
    """Create the logger and set the training database and file folders."""
    self.logger = App_Logger()
    # Folders of the validated training files.
    self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
    self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
    # Folder in which the database files are stored.
    self.path = 'Training_Database/'
def __init__(self, path):
    """Create the helper objects and open the main training log.

    ``path`` is forwarded to ``Raw_Data_Validation``.
    """
    self.raw_data = Raw_Data_Validation(path)
    self.dataTransform = dataTransform()
    self.dBOperation = dBOperation()
    # Opened in append mode and kept open on the instance.
    self.file_object = open("Training_Logs/Training_Main_Log.txt", 'a+')
    self.log_writer = App_Logger()
class Raw_Data_validation:
    """Validate the raw training batch files.

    Files are checked against the schema file (name pattern, number of
    columns, columns without any value). Accepted files end up in
    ``Training_Raw_files_validated/Good_Raw``, rejected ones in
    ``Training_Raw_files_validated/Bad_Raw``. Every step is logged into
    ``Training_Logs/``.
    """

    _LOG_DIR = "Training_Logs/"
    _VALIDATED_DIR = 'Training_Raw_files_validated/'
    _GOOD_DIR = "Training_Raw_files_validated/Good_Raw"
    _BAD_DIR = "Training_Raw_files_validated/Bad_Raw"

    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_training.json'
        self.logger = App_Logger()

    def _log(self, log_name, message):
        """Append ``message`` to ``Training_Logs/<log_name>`` and close it."""
        with open(self._LOG_DIR + log_name, 'a+') as log_file:
            self.logger.log(log_file, message)

    def valuesFromSchema(self):
        """Read the validation settings from the schema file.

        Returns:
            Tuple ``(LengthOfDateStampInFile, LengthOfTimeStampInFile,
            column_names, NumberofColumns)``.

        Raises:
            ValueError, KeyError, Exception: logged and re-raised unchanged.
        """
        log_name = "valuesfromSchemaValidationLog.txt"
        try:
            with open(self.schema_path, 'r') as f:
                dic = json.load(f)
            dic['SampleFileName']  # required key; its value is not used here
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']
            message = (
                "LengthOfDateStampInFile:: %s\tLengthOfTimeStampInFile:: %s"
                "\t NumberofColumns:: %s\n"
                % (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                   NumberofColumns))
            self._log(log_name, message)
        except ValueError:
            self._log(
                log_name,
                "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            self._log(log_name,
                      "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            self._log(log_name, str(e))
            raise
        return (LengthOfDateStampInFile, LengthOfTimeStampInFile,
                column_names, NumberofColumns)

    def manualRegexCreation(self):
        """Return the hand-written regex used to validate batch file names."""
        # Raw string: same pattern text, without invalid-escape warnings.
        regex = r"['cardio']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryForGoodBadRawData(self):
        """Create the Good_Raw and Bad_Raw folders when they do not exist.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            for folder in ("Good_Raw/", "Bad_Raw/"):
                path = os.path.join("Training_Raw_files_validated/", folder)
                if not os.path.isdir(path):
                    os.makedirs(path)
        except OSError as ex:
            self._log("GeneralLog.txt",
                      "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """Delete the Good_Raw folder (with its content) if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            if os.path.isdir(self._VALIDATED_DIR + 'Good_Raw/'):
                shutil.rmtree(self._VALIDATED_DIR + 'Good_Raw/')
                self._log("GeneralLog.txt",
                          "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            self._log("GeneralLog.txt",
                      "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """Delete the Bad_Raw folder (with its content) if it exists.

        Raises:
            OSError: logged and re-raised unchanged.
        """
        try:
            if os.path.isdir(self._VALIDATED_DIR + 'Bad_Raw/'):
                shutil.rmtree(self._VALIDATED_DIR + 'Bad_Raw/')
                self._log(
                    "GeneralLog.txt",
                    "BadRaw directory deleted before starting validation!!!")
        except OSError as s:
            self._log("GeneralLog.txt",
                      "Error while Deleting Directory : %s" % s)
            raise

    def moveBadFilesToArchiveBad(self):
        """Move the Bad_Raw files into a time-stamped archive folder.

        The archive is ``TrainingArchiveBadData/BadData_<date>_<HHMMSS>``.
        Afterwards the Bad_Raw folder is deleted. Nothing happens when the
        Bad_Raw folder does not exist.

        Raises:
            Exception: logged and re-raised unchanged.
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_files_validated/Bad_Raw/'
            if os.path.isdir(source):
                path = "TrainingArchiveBadData"
                if not os.path.isdir(path):
                    os.makedirs(path)
                dest = 'TrainingArchiveBadData/BadData_' + str(
                    date) + "_" + str(time)
                if not os.path.isdir(dest):
                    os.makedirs(dest)
                # Names already archived are skipped; listing once is enough
                # because the names inside the source folder are unique.
                archived = set(os.listdir(dest))
                for f in os.listdir(source):
                    if f not in archived:
                        shutil.move(source + f, dest)
                with open(self._LOG_DIR + "GeneralLog.txt", 'a+') as file:
                    self.logger.log(file, "Bad files moved to archive")
                    if os.path.isdir(self._VALIDATED_DIR + 'Bad_Raw/'):
                        shutil.rmtree(self._VALIDATED_DIR + 'Bad_Raw/')
                    self.logger.log(
                        file, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            self._log("GeneralLog.txt",
                      "Error while moving bad files to archive:: %s" % e)
            raise

    @staticmethod
    def _has_valid_name(filename, regex, LengthOfDateStampInFile,
                        LengthOfTimeStampInFile):
        """Tell whether ``filename`` matches the pattern and stamp lengths."""
        if not re.match(regex, filename):
            return False
        parts = re.split('_', re.split('.csv', filename)[0])
        # A matching name without both stamps is invalid, not an error.
        return (len(parts) > 2
                and len(parts[1]) == LengthOfDateStampInFile
                and len(parts[2]) == LengthOfTimeStampInFile)

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """Sort the batch files into Good_Raw / Bad_Raw by their file name.

        The Good_Raw and Bad_Raw folders are recreated first, in case an
        earlier run left them behind. Files are copied from
        ``self.Batch_Directory``.

        Raises:
            Exception: logged and re-raised unchanged.
        """
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        onlyfiles = [f for f in listdir(self.Batch_Directory)]
        try:
            with open(self._LOG_DIR + "nameValidationLog.txt", 'a+') as f:
                for filename in onlyfiles:
                    # Copy from the directory that was listed above.
                    source = os.path.join(self.Batch_Directory, filename)
                    if self._has_valid_name(filename, regex,
                                            LengthOfDateStampInFile,
                                            LengthOfTimeStampInFile):
                        shutil.copy(source, self._GOOD_DIR)
                        self.logger.log(
                            f,
                            "Valid File name!! File moved to GoodRaw Folder :: %s"
                            % filename)
                    else:
                        shutil.copy(source, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                            % filename)
        except Exception as e:
            self._log("nameValidationLog.txt",
                      "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberofColumns):
        """Move Good_Raw files with a wrong number of columns to Bad_Raw.

        The files are read with ``;`` as separator. Files with the expected
        number of columns are left untouched.

        Raises:
            OSError, Exception: logged and re-raised unchanged.
        """
        log_name = "columnValidationLog.txt"
        try:
            with open(self._LOG_DIR + log_name, 'a+') as f:
                self.logger.log(f, "Column Length Validation Started!!")
                for file in listdir(self._GOOD_DIR + "/"):
                    file_path = self._GOOD_DIR + "/" + file
                    frame = pd.read_csv(file_path, sep=';')
                    if frame.shape[1] != NumberofColumns:
                        shutil.move(file_path, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                            % file)
                self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as err:
            self._log(log_name,
                      "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._log(log_name, "Error Occured:: %s" % e)
            raise

    def validateMissingValuesInWholeColumn(self, rootProjPath=None):
        """Move Good_Raw files having a column without any value to Bad_Raw.

        The remaining files are rewritten in place.

        Args:
            rootProjPath: unused; kept (now optional) so existing calls with
                or without the argument keep working.

        Raises:
            OSError, Exception: logged and re-raised unchanged.
        """
        log_name = "missingValuesInColumn.txt"
        try:
            with open(self._LOG_DIR + log_name, 'a+') as f:
                self.logger.log(f, "Missing Values Validation Started!!")
                for file in listdir(self._GOOD_DIR + "/"):
                    file_path = self._GOOD_DIR + "/" + file
                    # NOTE(review): read with the default "," separator while
                    # validateColumnLength uses ";" - confirm which is meant.
                    frame = pd.read_csv(file_path)
                    # count() ignores missing values, so 0 means "all missing".
                    has_empty_column = any(
                        frame[column].count() == 0 for column in frame)
                    if has_empty_column:
                        shutil.move(file_path, self._BAD_DIR)
                        self.logger.log(
                            f,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)
                    else:
                        frame.to_csv(file_path, index=None, header=True)
        except OSError as err:
            self._log(log_name,
                      "Error Occured while moving the file :: %s" % err)
            raise
        except Exception as e:
            self._log(log_name, "Error Occured:: %s" % e)
            raise
def preprocess_and_split(config_path, nrows=100000):
    """Preprocess the raw training data and store it as a CSV file.

    The input and output paths are taken from the configuration read by
    ``read_params``: ``load_data.raw_train_data_csv`` is read and the result
    is written to ``split_data.train_path``. Progress is logged to
    ``Training_log.txt``.

    Args:
        config_path: path handed to ``read_params``.
        nrows: maximum number of rows read from the raw CSV file.
    """
    # The with-block closes the log file even when a step below raises.
    with open('Training_log.txt', 'a+') as file_object:
        logger = App_Logger()
        config = read_params(config_path)
        train_data_path = config["split_data"]["train_path"]
        raw_train_data_path = config["load_data"]["raw_train_data_csv"]
        logger.log(file_object, "Training Data load was successful")
        train_df = pd.read_csv(raw_train_data_path, nrows=nrows)
        logger.log(file_object, "Data reading successful")

        # 1. Extract features from the date column
        train_df = date_process(train_df)
        logger.log(file_object,
                   "Datetime Processing in train data completed ")

        # 2. Find the columns holding json data
        train_json_columns = column_validator(train_df)
        logger.log(file_object, "Column_validator successful")

        # 2.1 Flatten the json columns and merge them into the dataset
        if train_json_columns is not None:
            train_df = json_to_df(train_df, train_json_columns)
        # Kept aside here and attached again in step 4.
        target = train_df['transactionRevenue']
        logger.log(file_object, "Normalizing the json columns completed")

        # 3. Drop columns with more than 50% null values and columns that do
        #    not contribute to the target variable
        train_df = remove_nan_cols(train_df)
        logger.log(file_object, "50% NAN value columns are removed")
        train_df = train_df.drop(
            [
                'sessionId',  # combination of fullVisitorId and visitId
                'visitStartTime',  # already extracted into visitHour
                'fullVisitorId',  # very long, little contribution to target
                'visitId',
                'weekday',
                'day',
            ],
            axis=1)
        logger.log(
            file_object,
            'Dropped columns which are not contributing to the transaction revenue'
        )

        # 4. Impute null values; transactionRevenue is attached again so
        #    that its missing values are imputed as well
        train_df = pd.concat([train_df, target], axis=1)
        train_df = impute_na(train_df)
        logger.log(file_object, "Imputing NAN values with 0 is completed")

        # 5. Change datatypes from object to the desired ones
        train_df = data_type_convert(train_df)
        logger.log(file_object, "Conversion of Datatype to int completed")

        # 6. Remove columns with constant values (zero standard deviation)
        train_df = remove_zero_std_cols(train_df)
        logger.log(file_object,
                   "Zero standard deviation columns are removed")

        # 7. Collect the categorical columns and label-encode them
        label_cols = categorical_cols(train_df)
        logger.log(file_object,
                   "Gathering of label _cols in train data completed ")
        train_df = label_encoding(train_df, label_cols)
        logger.log(file_object, "Label_encoding in train data completed ")

        # 8. Impute the pageviews column with KNNImputer
        from sklearn.impute import KNNImputer
        imputer = KNNImputer()
        train_df['pageviews'] = imputer.fit_transform(train_df[['pageviews']])
        logger.log(file_object, "Pageviews column imputed with KNNimputer")

        # Store the processed training data
        train_df.to_csv(train_data_path,
                        sep=",",
                        index=False,
                        encoding="utf-8")
        logger.log(
            file_object,
            "Traning data is processed and stored as data/processed/train_processed.csv"
        )
def __init__(self):
    """Create the logger and record the folder of validated good raw files."""
    self.logger = App_Logger()
    self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
class prediction:
    """Run the cluster-wise prediction pipeline and write the results to CSV.

    The constructor opens ``Prediction_Logs/Prediction_Log.txt`` in append
    mode. Nothing in this class closes that handle, so it stays open for the
    lifetime of the instance.
    """

    def __init__(self, path):
        """Open the prediction log and build the validator for ``path``."""
        self.file_object = open("Prediction_Logs/Prediction_Log.txt", 'a+')
        self.log_writer = App_Logger()
        self.pred_data_val = Prediction_Data_Validation(path)

    def predictionFromModel(self):
        """Predict for the loaded data and save ``Predictions.csv``.

        The data is imputed (when nulls are present), log-transformed and
        scaled, then assigned to clusters by the saved ``KMeans`` model. Each
        cluster is scored by the model file selected for it.

        Returns:
            The path of the written prediction file.

        Raises:
            Exception: Any error from the pipeline is logged and re-raised.
        """
        try:
            # Delete the prediction file left over from the last run.
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')

            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            is_null_present, _ = preprocessor.is_null_present(data)
            if is_null_present:
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.logTransformation(data)

            # Scale the prediction data. The index is reset to 0..n-1 so the
            # row labels collected below identify input row positions.
            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data),
                columns=data.columns).reset_index(drop=True)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            data_scaled['clusters'] = kmeans.predict(data_scaled)
            clusters = data_scaled['clusters'].unique()

            result = []  # predictions, collected cluster by cluster
            row_ids = []  # input row label of each collected prediction
            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result.extend(model.predict(cluster_data.values))
                row_ids.extend(cluster_data.index)

            # Restore input row order: predictions were gathered per cluster,
            # so without this row i of the file would not be input row i.
            result = pandas.DataFrame(result,
                                      columns=['Predictions'],
                                      index=row_ids).sort_index()

            path = "Prediction_Output_File/Predictions.csv"
            # to_csv writes a fresh file (default mode 'w'); it does not append.
            result.to_csv(path, header=True)
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occured while running the prediction!! Error:: %s' % ex)
            raise
        return path