class dataTransformPredict:
    """Transform validated prediction raw files before database/model use."""

    def __init__(self):
        # Folder holding raw prediction files that passed validation.
        self.goodDataPath = "PredictionRawFilesValidated/GoodRaw"
        self.logger = AppLogger()

    def replaceMissingValueWithNull(self, data=None):
        """Replace missing values with a 'NULL'/'Null' marker string.

        When *data* is None, every CSV in ``self.goodDataPath`` is rewritten
        in place: missing cells become 'NULL' and the first 6 characters of
        the 'Comments' column are dropped (presumably a fixed prefix --
        TODO confirm).  Otherwise the given DataFrame is filled and returned.

        Returns the filled DataFrame, or None in file mode / on failure
        (failures are logged and swallowed, matching the original contract).
        """
        log_file = open("PredictionLogs/dataTransformLog.txt", 'a+')
        try:
            if data is None:
                for file in listdir(self.goodDataPath):
                    csv = pandas.read_csv(self.goodDataPath + "/" + file)
                    csv.fillna('NULL', inplace=True)
                    csv['Comments'] = csv['Comments'].str[6:]
                    csv.to_csv(self.goodDataPath + "/" + file,
                               index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file)
            else:
                # BUG FIX: fillna(..., inplace=True) returns None, so the
                # original assignment discarded the DataFrame and the method
                # always returned None in this branch.
                data = data.fillna('Null')
                # BUG FIX: the original formatted the log *file object* into
                # the message instead of anything meaningful.
                self.logger.log(
                    log_file, "DataFrame Transformed successfully!!")
            return data
        except Exception as e:
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
        finally:
            # BUG FIX: single close on every path -- the original leaked the
            # handle on the success `return` and double-closed after errors.
            log_file.close()
def __init__(self):
    """Remember the validated prediction-file folder and create the logger."""
    self.logger = AppLogger()
    self.goodDataPath = "PredictionRawFilesValidated/GoodRaw"
def __init__(self):
    """Remember the validated training-file folder and create the logger."""
    self.logger = AppLogger()
    self.goodDataPath = "TrainingRawFilesValidated/GoodRaw"
class TrainValidation:
    """Orchestrates the training pipeline: raw-file validation, data
    transformation, MongoDB load, and CSV export."""

    def __init__(self, path):
        # path: folder containing the incoming raw training batch files.
        self.rawData = RawDataValidation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = dBOperation()
        self.file_object = open("TrainingLogs/TrainingMainLog.txt", "a+")
        self.log_writer = AppLogger()

    def trainValidation(self):
        """Run the full validation/transformation/load sequence.

        Re-raises any failure from the underlying steps with its original
        traceback.  The main log handle is closed on every path (the
        original `raise e` lost context and leaked the handle on failure).
        """
        try:
            self.log_writer.log(self.file_object, "Start Validation on files!!!")
            (LengthOfDataTimestamp, ColNames,
             NumberOfColumns) = self.rawData.valuesFromSchema()
            regex = self.rawData.manualRegEx()
            self.rawData.validateFileNameRaw(regex, LengthOfDataTimestamp)
            self.rawData.validateColumnLength(NumberOfColumns)
            self.log_writer.log(self.file_object, "Raw Data Validation Complete!!")
            self.log_writer.log(self.file_object, "Starting Data Transforamtion!!")
            self.dataTransform.replaceMissingValueWithNull(None)
            self.log_writer.log(self.file_object, "DataTransformation Completed!!!")
            self.log_writer.log(
                self.file_object,
                "Creating Training_Database and tables on the basis of given schema!!!"
            )
            self.dBOperation.createTableDb("Training")
            self.log_writer.log(self.file_object, "Table creation Completed!!")
            self.log_writer.log(self.file_object,
                                "Insertion of Data into Table started!!!!")
            self.dBOperation.insertIntoTableGoodData("Training")
            self.log_writer.log(self.file_object, "Insertion in Table completed!!!")
            self.log_writer.log(self.file_object, "Deleting Good Data Folder!!!")
            self.rawData.deleteExistingGoodDataTrainingFolder()
            self.log_writer.log(self.file_object, "Good_Data folder deleted!!!")
            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting Bad_Data folder!!!")
            self.rawData.moveBadFilesToArchivedBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object, "Validation Operation completed!!")
            self.log_writer.log(self.file_object, "Extracting csv file from table")
            self.dBOperation.selectingDatafromtableintocsv("Training")
        finally:
            self.file_object.close()
def __init__(self, path):
    """Wire up the validation, transform, and DB helpers plus the main log.

    path: folder containing the incoming raw training batch files.
    """
    # Collaborators are created in the same order as before so any
    # constructor failure surfaces at the same point.
    self.rawData = RawDataValidation(path)
    self.dataTransform = dataTransform()
    self.dBOperation = dBOperation()
    # Main pipeline log, appended across runs.
    self.file_object = open("TrainingLogs/TrainingMainLog.txt", "a+")
    self.log_writer = AppLogger()
def __init__(self, path):
    """Record the raw-batch folder, the training schema path, and a logger."""
    self.logger = AppLogger()
    self.schemaPath = 'schema_training.json'
    self.path = path
class RawDataValidation:
    """Validates raw training batch files against schema_training.json:
    file-name pattern, timestamp length, and column count."""

    def __init__(self, path):
        # Folder containing the incoming raw batch files.
        self.path = path
        self.schemaPath = 'schema_training.json'
        self.logger = AppLogger()

    def valuesFromSchema(self):
        """Extract validation parameters from the training schema file.

        Returns:
            tuple: (LengthOfDateTimeStampInFile, ColNames, NumberOfColumn)

        Raises:
            ValueError/KeyError/Exception from reading or indexing the schema.
        """
        try:
            with open(self.schemaPath, 'r') as f:
                dic = json.load(f)
            # Read every expected key so a malformed schema fails fast with
            # KeyError (SampleFileName is otherwise unused here).
            pattern = dic['SampleFileName']
            LengthOfDateTimeStampInFile = dic['LengthOFDateStampInFile']
            ColNames = dic['ColName']
            NumberOfColumn = dic['NumberOfColumn']
            # CONSISTENCY FIX: a single log-file name is used throughout --
            # the original mixed 'valuesFromSchema...' and 'valuesfromSchema...',
            # which are distinct files on case-sensitive filesystems.
            with open("TrainingLogs/valuesFromSchemaValidationLog.txt", "a+") as file:
                message = ("LengthOfDateTimeStampInFile:: %s" % LengthOfDateTimeStampInFile
                           + "\t " + "NumberofColumns:: %s" % NumberOfColumn + "\n")
                self.logger.log(file, message)
        except ValueError:
            with open("TrainingLogs/valuesFromSchemaValidationLog.txt", 'a+') as file:
                self.logger.log(file, "ValueError:Value not found inside schema_training.json")
            raise
        except KeyError:
            with open("TrainingLogs/valuesFromSchemaValidationLog.txt", 'a+') as file:
                self.logger.log(file, "KeyError:Key value error incorrect key passed")
            raise
        except Exception as e:
            with open("TrainingLogs/valuesFromSchemaValidationLog.txt", 'a+') as file:
                self.logger.log(file, str(e))
            raise
        return LengthOfDateTimeStampInFile, ColNames, NumberOfColumn

    def manualRegEx(self):
        """Return the regex used to validate raw file names.

        NOTE(review): the character classes look unintended (quotes inside
        brackets), but the pattern is kept byte-identical to preserve the
        current matching behavior.
        """
        return "['Review']+['\_'']+[\d]+\.csv"

    def createDirectoryForGoodBadRawData(self):
        """Create TrainingRawfilesValidated/{GoodRaw,BadRaw} if missing."""
        try:
            for sub in ("GoodRaw/", "BadRaw/"):
                path = os.path.join("TrainingRawfilesValidated/", sub)
                if not os.path.exists(path):
                    os.makedirs(path)
        except OSError as ex:
            with open("TrainingLogs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file, "Error while creating Directory %s:" % ex)
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """Delete the GoodRaw folder once its data has been consumed."""
        try:
            path = "TrainingRawfilesValidated/"
            if os.path.exists(path + "GoodRaw/"):
                shutil.rmtree(path + "GoodRaw/")
                with open("TrainingLogs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(file, "GoodRaw directory deleted successfully!!!")
        except OSError as s:
            with open("TrainingLogs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file, "Error while Deleting Directory : %s" % s)
            raise

    def deleteExistingBadDataTrainingFolder(self):
        """Delete the BadRaw folder."""
        try:
            path = "TrainingRawfilesValidated/"
            if os.path.exists(path + "BadRaw/"):
                shutil.rmtree(path + "BadRaw/")
                with open("TrainingLogs/GeneralLog.txt", 'a+') as file:
                    self.logger.log(file, "BadRaw directory deleted successfully!!!")
        except OSError as s:
            with open("TrainingLogs/GeneralLog.txt", 'a+') as file:
                self.logger.log(file, "Error while Deleting Directory : %s" % s)
            # BUG FIX: `raise OSError` discarded the caught instance.
            raise

    def moveBadFilesToArchivedBad(self):
        """Move BadRaw files into a timestamped archive folder, then delete BadRaw."""
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = "TrainingRawfilesValidated/BadRaw/"
            if os.path.isdir(source):
                if not os.path.exists("TrainingArchivedBadData"):
                    os.makedirs("TrainingArchivedBadData")
                destPath = ("TrainingArchivedBadData/BadData_"
                            + str(date) + "_" + str(time))
                if not os.path.exists(destPath):
                    os.makedirs(destPath)
                # Skip files already archived under the same name.
                for name in os.listdir(source):
                    if name not in os.listdir(destPath):
                        shutil.move(source + name, destPath)
                with open("TrainingLogs/GeneralLog.txt", 'a+') as log:
                    self.logger.log(log, "Bad files moved to archive")
                    path = 'TrainingRawfilesValidated/'
                    # BUG FIX: the original checked/removed 'Bad_Raw/' which
                    # never exists -- the folder created above is 'BadRaw/',
                    # so the bad-data folder was never actually deleted.
                    if os.path.isdir(path + 'BadRaw/'):
                        shutil.rmtree(path + 'BadRaw/')
                    self.logger.log(log, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            with open("TrainingLogs/GeneralLog.txt", 'a+') as log:
                self.logger.log(log, "Error while moving bad files to archive:: %s" % e)
            raise

    def validateFileNameRaw(self, regex, LengthOfDateTimeStampInFile):
        """Sort raw batch files into GoodRaw/BadRaw by file-name pattern
        and timestamp length; recreates both folders first."""
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        try:
            with open("TrainingLogs/nameValidationLog.txt", "a+") as file:
                # NOTE(review): files are listed from self.path but copied
                # from the hard-coded "TrainingBatchFiles/" -- confirm these
                # are intended to be the same folder.
                for filename in listdir(self.path):
                    valid = False
                    if re.match(regex, filename):
                        splitAtDot = re.split("_", re.split(".csv", filename)[0])
                        if len(splitAtDot[1]) == LengthOfDateTimeStampInFile:
                            valid = True
                    if valid:
                        shutil.copy("TrainingBatchFiles/" + filename,
                                    "TrainingRawfilesValidated/GoodRaw")
                        self.logger.log(
                            file,
                            "Valid File name!! File moved to GoodRaw Folder :: %s" % filename)
                    else:
                        shutil.copy("TrainingBatchFiles/" + filename,
                                    "TrainingRawfilesValidated/BadRaw")
                        self.logger.log(
                            file,
                            "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)
        except Exception as e:
            with open("TrainingLogs/nameValidationLog.txt", 'a+') as f:
                self.logger.log(f, "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberOfColumn):
        """Move GoodRaw files whose column count differs from the schema
        to BadRaw."""
        try:
            with open("TrainingLogs/columnValidationLog.txt", "a+") as file:
                self.logger.log(file, "Column Length Validation Started!!")
                for filename in listdir("TrainingRawfilesValidated/GoodRaw"):
                    csv = pd.read_csv("TrainingRawfilesValidated/GoodRaw/" + filename)
                    # Drop the spurious unnamed third column some files carry.
                    if csv.shape[1] > 2:
                        csv = csv.drop(['Unnamed: 2'], axis=1)
                    if csv.shape[1] != NumberOfColumn:
                        print(csv.columns)
                        # BUG FIX: destination casing was
                        # "TrainingRawFilesValidated" (capital F) -- a
                        # different directory on case-sensitive filesystems.
                        shutil.move("TrainingRawfilesValidated/GoodRaw/" + filename,
                                    "TrainingRawfilesValidated/BadRaw")
                        # BUG FIX: log the file *name*; the original
                        # formatted the log handle into the message.
                        self.logger.log(
                            file,
                            "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % filename)
                self.logger.log(file, "Column Length Validation Completed!!")
        except OSError as ex:
            with open("TrainingLogs/columnValidationLog.txt", 'a+') as f:
                # BUG FIX: format the caught exception, not the OSError class.
                self.logger.log(f, "Error Occured while moving the file :: %s" % ex)
            raise
        except Exception as e:
            with open("TrainingLogs/columnValidationLog.txt", 'a+') as f:
                self.logger.log(f, "Error Occured:: %s" % e)
            raise
def __init__(self, path, file):
    """Wire up prediction-side validation and transformation helpers.

    path: folder containing the raw prediction batch files.
    file: batch file name, kept for later use by the pipeline.
    """
    self.file = file
    self.rawData = PredictionRawDataValidation(path, file)
    self.dataTransform = dataTransformPredict()
    # Main prediction log, appended across runs.
    self.file_object = open("PredictionLogs/PredictionMainLog.txt", "a+")
    self.log_writer = AppLogger()
class PredictionValidation:
    """Orchestrates raw-data validation and transformation for prediction
    batches."""

    def __init__(self, path, file):
        # path: folder with raw prediction files; file: the batch file name.
        self.rawData = PredictionRawDataValidation(path, file)
        self.file = file
        self.dataTransform = dataTransformPredict()
        self.file_object = open("PredictionLogs/PredictionMainLog.txt", "a+")
        self.log_writer = AppLogger()

    def predictionValidation(self):
        """Run name/column validation, transformation, and bad-file archiving.

        Re-raises any underlying failure with its original traceback; the
        main log handle is closed on every path (the original `raise e`
        pattern leaked the handle on failure).
        """
        try:
            self.log_writer.log(self.file_object, "Start Validation on files!!!")
            (LengthOfDataTimestamp, ColNames,
             NumberOfColumns) = self.rawData.valuesFromSchema()
            regex = self.rawData.manualRegex()
            self.rawData.validateFileNameRaw(regex, LengthOfDataTimestamp)
            self.rawData.validateColumnLength(NumberOfColumns)
            self.log_writer.log(self.file_object, "Raw Data Validation Complete!!")
            self.log_writer.log(self.file_object, "Starting Data Transforamtion!!")
            self.dataTransform.replaceMissingValueWithNull(None)
            self.log_writer.log(self.file_object, "DataTransformation Completed!!!")
            self.log_writer.log(
                self.file_object,
                "Creating PredictionDatabase and tables on the basis of given schema!!!")
            self.log_writer.log(
                self.file_object,
                "Moving bad files to Archive and deleting BadData folder!!!")
            self.rawData.moveBadFilesToArchiveBad()
            self.log_writer.log(
                self.file_object,
                "Bad files moved to archive!! Bad folder Deleted!!")
            self.log_writer.log(self.file_object, "Validation Operation completed!!")
        finally:
            self.file_object.close()
def __init__(self):
    """Record validated-file folders, open the Mongo client, create a logger."""
    self.logger = AppLogger()
    self.goodFilePath = "TrainingRawfilesValidated/GoodRaw/"
    self.badFilePath = "TrainingRawfilesValidated/BadRaw/"
    # Local MongoDB instance on the default port.
    self.client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
class dBOperation:
    """MongoDB operations for the training pipeline: ensure the GoodRawData
    collection, load the validated CSVs into it, and export it back to CSV."""

    def __init__(self):
        self.badFilePath = "TrainingRawfilesValidated/BadRaw/"
        self.goodFilePath = "TrainingRawfilesValidated/GoodRaw/"
        # Local MongoDB instance on the default port.
        self.client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        self.logger = AppLogger()

    def _log(self, logPath, message):
        # Open/append/close helper -- this pattern was duplicated a dozen
        # times across the original methods.
        file = open(logPath, 'a+')
        self.logger.log(file, message)
        file.close()

    def dataBaseConnection(self, DatabaseName):
        """Return a handle to *DatabaseName* on the shared Mongo client."""
        try:
            conn = self.client[DatabaseName]
            self._log("TrainingLogs/DataBaseConnectionLog.txt",
                      "Opened %s database successfully" % DatabaseName)
        except ConnectionError:
            # NOTE(review): pymongo raises ConnectionFailure, not the builtin
            # ConnectionError -- confirm which exception is intended here.
            self._log("TrainingLogs/DataBaseConnectionLog.txt",
                      "Error while connecting to database: %s" % ConnectionError)
            # BUG FIX: re-raise the caught instance, not the bare class.
            raise
        return conn

    def createTableDb(self, DatabaseName):
        """Ensure an empty GoodRawData collection exists in *DatabaseName*.

        An existing, already-empty collection is left alone; an existing
        non-empty one has its documents removed; a missing one is created.
        """
        try:
            conn = self.dataBaseConnection(DatabaseName)
            # NOTE(review): collection_names()/count()/remove() were removed
            # in pymongo 4 -- list_collection_names()/count_documents()/
            # delete_many() are the modern equivalents. Kept as-is since the
            # installed pymongo version is not visible from this file.
            if "GoodRawData" in conn.collection_names():
                allData = conn.GoodRawData.find(
                    {}, {"_id": 0, "Comments": 1, "Ratings": 1})
                if allData.count() == 0:
                    self.client.close()
                    self._log("TrainingLogs/DataBaseCollectionCreateLog.txt",
                              "Collection Already Exists!!")
                    self._log("TrainingLogs/DataBaseInsertLog.txt",
                              "No Data Found!!")
                    self._log("TrainingLogs/DataBaseConnectionLog.txt",
                              "Closed %s database successfully" % DatabaseName)
                    return
                conn.GoodRawData.remove()
                self.client.close()
                self._log("TrainingLogs/DataBaseCollectionCreateLog.txt",
                          "Collection Already Exists!!")
                self._log("TrainingLogs/DataBaseInsertLog.txt",
                          "Data Deletion Successfully!!")
                self._log("TrainingLogs/DataBaseConnectionLog.txt",
                          "Closed %s database successfully" % DatabaseName)
            else:
                conn.create_collection("GoodRawData")
                self.client.close()
                self._log("TrainingLogs/DataBaseCollectionCreateLog.txt",
                          "Collection created successfully!!")
                self._log("TrainingLogs/DataBaseConnectionLog.txt",
                          "Closed %s database successfully" % DatabaseName)
        except Exception as e:
            self._log("TrainingLogs/DataBaseCollectionCreateLog.txt",
                      "Error while creating table: %s " % e)
            self.client.close()
            self._log("TrainingLogs/DataBaseConnectionLog.txt",
                      "Closed %s database successfully" % DatabaseName)
            raise

    def insertIntoTableGoodData(self, Database):
        """Insert every GoodRaw CSV into the GoodRawData collection.

        Per-file failures are logged and skipped; the log handle is closed
        exactly once.
        """
        conn = self.dataBaseConnection(Database)
        logFile = open("TrainingLogs/DataBaseInsertLog.txt", 'a+')
        try:
            onlyfiles = os.listdir(self.goodFilePath)
            if onlyfiles:
                for f in onlyfiles:
                    try:
                        data = pd.read_csv(os.path.join(self.goodFilePath, f))
                        documents = [
                            {'Comments': comment, 'Ratings': label}
                            for comment, label in zip(data['Comments'],
                                                      data['Ratings'])
                        ]
                        conn.GoodRawData.insert_many(documents)
                    except Exception as e:
                        self.logger.log(
                            logFile,
                            "Insertion in Collection Failed. Error: %s" % e)
            else:
                print("No files")
            self.client.close()
            # BUG FIX: the original closed logFile inside the `if` branch and
            # then logged to the closed handle here, raising
            # "I/O operation on closed file" whenever files were present.
            self.logger.log(logFile, "Insertion in Collection Successfully.")
        finally:
            logFile.close()

    def selectingDatafromtableintocsv(self, Database):
        """Export the GoodRawData collection to TrainingFileFromDB/InputFile.csv.

        Failures are logged and swallowed (matching the original contract);
        the log handle is closed on every path, including the early return.
        """
        self.fileFromDb = 'TrainingFileFromDB/'
        self.fileName = 'InputFile.csv'
        log_file = open("TrainingLogs/ExportToCsv.txt", 'a+')
        try:
            # BUG FIX: the original opened the database connection twice.
            conn = self.dataBaseConnection(Database)
            if conn.GoodRawData.count() == 0:
                self.logger.log(log_file, 'No Record in GoodRawData Collection')
                return
            if not os.path.isdir(self.fileFromDb):
                os.makedirs(self.fileFromDb)
            rows = [
                {'Comments': row['Comments'], 'Ratings': row['Ratings']}
                for row in conn.GoodRawData.find(
                    {}, {"_id": 0, "Comments": 1, "Ratings": 1})
            ]
            dataframe = pd.DataFrame(rows, columns=['Comments', 'Ratings'])
            dataframe.to_csv(os.path.join(self.fileFromDb, self.fileName),
                             index=None)
            self.logger.log(log_file, 'CSV File Exported Successfully !!!')
            self.logger.log(
                log_file,
                'Successfully Executed selectingDatafromtableintocsv method of dbOperation class of dbOperation package'
            )
        except Exception as e:
            self.logger.log(log_file, "File exporting failed. Error : %s" % e)
        finally:
            log_file.close()
def __init__(self, path, file):
    """Record the prediction batch location, file name, schema path, logger."""
    self.logger = AppLogger()
    self.schema_path = 'schema_prediction.json'
    self.file = file
    self.path = path