class dataTransformPredict:
    """Replace missing values in validated prediction data.

    Works either on every file found under ``goodDataPath`` (each file is
    rewritten in place) or on a DataFrame handed in by the caller.
    """

    def __init__(self):
        self.goodDataPath = "PredictionRawFilesValidated/GoodRaw"
        self.logger = AppLogger()

    def replaceMissingValueWithNull(self, data=None):
        """Fill missing values and log the outcome.

        Args:
            data: Optional DataFrame. When ``None``, every file listed in
                ``self.goodDataPath`` is read as CSV, filled with ``'NULL'``,
                has the first six characters of its ``Comments`` column
                removed, and is written back to the same path. Otherwise
                ``data`` itself is filled with ``'Null'`` in place.

        Returns:
            The (mutated) ``data`` when one was supplied, otherwise ``None``.
            ``None`` is also returned when the transformation fails: the
            error is written to the log instead of being raised.
        """
        # The context manager closes the log on every exit path, including
        # the early return below and the swallowed-exception path.
        with open("PredictionLogs/dataTransformLog.txt", 'a+') as log_file:
            try:
                if data is not None:
                    # fillna(inplace=True) returns None, so its result must
                    # not be assigned back to ``data``.
                    data.fillna('Null', inplace=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % "DataFrame")
                    return data

                for file_name in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file_name
                    frame = pandas.read_csv(file_path)
                    frame.fillna('NULL', inplace=True)
                    # Drop the first six characters of every comment.
                    frame['Comments'] = frame['Comments'].str[6:]
                    frame.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file_name)
            except Exception as e:
                # Best effort by design: record the failure, do not raise.
                self.logger.log(
                    log_file,
                    "Data Transformation failed because:: %s" % e)
        return None
 def __init__(self):
     """Store the validated prediction data path and create a logger."""
     # Relative path kept on the instance for later use.
     good_raw_dir = "PredictionRawFilesValidated/GoodRaw"
     self.goodDataPath = good_raw_dir
     self.logger = AppLogger()
Пример #3
0
 def __init__(self):
     """Store the validated training data path and create a logger."""
     # Relative path kept on the instance for later use.
     good_raw_dir = "TrainingRawFilesValidated/GoodRaw"
     self.goodDataPath = good_raw_dir
     self.logger = AppLogger()
class TrainValidation:
    """Drive the training-data validation pipeline from raw files to CSV."""

    def __init__(self, path):
        self.rawData = RawDataValidation(path)
        self.dataTransform = dataTransform()
        self.dBOperation = dBOperation()
        self.file_object = open("TrainingLogs/TrainingMainLog.txt", "a+")
        self.log_writer = AppLogger()

    def _log(self, message):
        """Write one message to the main training log."""
        self.log_writer.log(self.file_object, message)

    def trainValidation(self):
        """Run every validation stage in order, logging progress.

        Stages, in order: schema lookup, file-name validation, column-count
        validation, missing-value replacement, collection creation, data
        insertion, good-folder cleanup, bad-file archiving and CSV export.

        Raises:
            Exception: whatever a stage raises is propagated unchanged.

        The main log file opened in ``__init__`` is closed when this method
        finishes, whether it succeeds or fails.
        """
        try:
            self._log("Start Validation on files!!!")

            LengthOfDataTimestamp, _, NumberOfColumns = (
                self.rawData.valuesFromSchema())
            regex = self.rawData.manualRegEx()

            self.rawData.validateFileNameRaw(regex, LengthOfDataTimestamp)
            self.rawData.validateColumnLength(NumberOfColumns)

            self._log("Raw Data Validation Complete!!")
            self._log("Starting Data Transforamtion!!")

            self.dataTransform.replaceMissingValueWithNull(None)

            self._log("DataTransformation Completed!!!")
            self._log(
                "Creating Training_Database and tables on the basis of given schema!!!"
            )

            self.dBOperation.createTableDb("Training")
            self._log("Table creation Completed!!")
            self._log("Insertion of Data into Table started!!!!")

            self.dBOperation.insertIntoTableGoodData("Training")
            self._log("Insertion in Table completed!!!")
            self._log("Deleting Good Data Folder!!!")

            self.rawData.deleteExistingGoodDataTrainingFolder()
            self._log("Good_Data folder deleted!!!")
            self._log(
                "Moving bad files to Archive and deleting Bad_Data folder!!!")

            self.rawData.moveBadFilesToArchivedBad()
            self._log("Bad files moved to archive!! Bad folder Deleted!!")
            self._log("Validation Operation completed!!")
            self._log("Extracting csv file from table")

            self.dBOperation.selectingDatafromtableintocsv("Training")
        finally:
            # Previously the handle stayed open whenever a stage raised.
            self.file_object.close()
 def __init__(self, path):
     """Create the collaborators and open the main training log.

     ``path`` is passed to ``RawDataValidation`` unchanged.
     """
     # Build everything first (same order as before), then attach.
     raw_validator = RawDataValidation(path)
     transformer = dataTransform()
     db_operation = dBOperation()
     main_log = open("TrainingLogs/TrainingMainLog.txt", "a+")
     app_logger = AppLogger()

     self.rawData = raw_validator
     self.dataTransform = transformer
     self.dBOperation = db_operation
     self.file_object = main_log
     self.log_writer = app_logger
Пример #6
0
 def __init__(self, path):
     """Remember ``path``, the schema file name, and create a logger."""
     schema_file = 'schema_training.json'
     self.schemaPath = schema_file
     self.path = path
     self.logger = AppLogger()
Пример #7
0
class RawDataValidation:
    """Validate raw training files and sort them into good/bad folders."""

    def __init__(self, path):
        self.path = path
        self.schemaPath = 'schema_training.json'
        self.logger = AppLogger()

    def _writeLog(self, logPath, message):
        """Append one message to ``logPath``; the handle is always closed."""
        with open(logPath, 'a+') as handle:
            self.logger.log(handle, message)

    @staticmethod
    def _hasValidTimestamp(filename, LengthOfDateTimeStampInFile):
        """Return True when the part after the first ``_`` has the given length."""
        stem = re.split(r"\.csv", filename)[0]
        parts = stem.split("_")
        # A name without "_" has no timestamp part and is simply invalid.
        return len(parts) > 1 and len(parts[1]) == LengthOfDateTimeStampInFile

    def valuesFromSchema(self):
        """Read the schema file and return the values used for validation.

        Returns:
            Tuple ``(LengthOFDateStampInFile, ColName, NumberOfColumn)`` taken
            from the JSON document at ``self.schemaPath``.

        Raises:
            ValueError: the schema could not be decoded.
            KeyError: a required key is missing from the schema.
            Exception: any other failure; it is logged and re-raised.
        """
        # One log file for both the success and the error paths (the error
        # paths used to write to a differently-cased file name).
        logPath = "TrainingLogs/valuesFromSchemaValidationLog.txt"
        try:
            with open(self.schemaPath, 'r') as f:
                dic = json.load(f)
            # Not used afterwards, but a missing key must still be a KeyError.
            _ = dic['SampleFileName']
            LengthOfDateTimeStampInFile = dic['LengthOFDateStampInFile']
            ColNames = dic['ColName']
            NumberOfColumn = dic['NumberOfColumn']

            message = "LengthOfDateTimeStampInFile:: %s" % LengthOfDateTimeStampInFile + "\t " + "NumberofColumns:: %s" % NumberOfColumn + "\n"
            self._writeLog(logPath, message)

        except ValueError:
            self._writeLog(logPath, "ValueError:Value not found inside schema_training.json")
            raise

        except KeyError:
            self._writeLog(logPath, "KeyError:Key value error incorrect key passed")
            raise

        except Exception as e:
            self._writeLog(logPath, str(e))
            raise

        return LengthOfDateTimeStampInFile, ColNames, NumberOfColumn

    def manualRegEx(self):
        """Return the regular expression used to validate file names.

        The pattern text is unchanged; it is now a raw string so the
        backslashes no longer trigger invalid-escape warnings.
        NOTE(review): the bracketed parts are character classes, so the
        pattern accepts more than the literal prefix ``Review_`` - confirm
        whether that looseness is intended before tightening it.
        """
        return r"['Review']+['\_'']+[\d]+\.csv"

    def createDirectoryForGoodBadRawData(self):
        """Create the GoodRaw and BadRaw folders if they do not exist.

        Raises:
            OSError: a folder could not be created (logged, then re-raised).
        """
        try:
            for folder in ("GoodRaw/", "BadRaw/"):
                path = os.path.join("TrainingRawfilesValidated/", folder)
                if not os.path.exists(path):
                    os.makedirs(path)

        except OSError as ex:
            self._writeLog("TrainingLogs/GeneralLog.txt",
                           "Error while creating Directory %s:" % ex)
            raise

    def _deleteValidatedFolder(self, folderName):
        """Remove ``TrainingRawfilesValidated/<folderName>/`` when present."""
        logPath = "TrainingLogs/GeneralLog.txt"
        try:
            target = "TrainingRawfilesValidated/" + folderName + "/"
            if os.path.exists(target):
                shutil.rmtree(target)
                self._writeLog(logPath, "%s directory deleted successfully!!!" % folderName)

        except OSError as s:
            self._writeLog(logPath, "Error while Deleting Directory : %s" % s)
            # Re-raise the original error instead of an empty OSError.
            raise

    def deleteExistingGoodDataTrainingFolder(self):
        """Delete the GoodRaw folder if it exists.

        Raises:
            OSError: the folder could not be removed.
        """
        self._deleteValidatedFolder("GoodRaw")

    def deleteExistingBadDataTrainingFolder(self):
        """Delete the BadRaw folder if it exists.

        Raises:
            OSError: the folder could not be removed.
        """
        self._deleteValidatedFolder("BadRaw")

    def moveBadFilesToArchivedBad(self):
        """Move everything in BadRaw to a timestamped archive folder.

        The archive folder is ``TrainingArchivedBadData/BadData_<date>_<time>``.
        After the move the BadRaw folder itself is removed. Nothing happens
        when BadRaw does not exist.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        now = datetime.now()
        stamp = str(now.date()) + "_" + now.strftime("%H%M%S")
        logPath = "TrainingLogs/GeneralLog.txt"
        source = "TrainingRawfilesValidated/BadRaw/"

        try:
            if os.path.isdir(source):
                destPath = "TrainingArchivedBadData/BadData_" + stamp
                # Also creates the parent archive folder when missing.
                os.makedirs(destPath, exist_ok=True)

                # List the destination once instead of once per file.
                alreadyArchived = set(os.listdir(destPath))
                for name in os.listdir(source):
                    if name not in alreadyArchived:
                        shutil.move(source + name, destPath)

                self._writeLog(logPath, "Bad files moved to archive")
                # The old check looked for 'Bad_Raw/', which never exists, so
                # the folder was reported deleted but was left in place.
                if os.path.isdir(source):
                    shutil.rmtree(source)
                self._writeLog(logPath, "Bad Raw Data Folder Deleted successfully!!")

        except Exception as e:
            self._writeLog(logPath, "Error while moving bad files to archive:: %s" % e)
            raise

    def validateFileNameRaw(self, regex, LengthOfDateTimeStampInFile):
        """Copy each file in ``self.path`` to GoodRaw or BadRaw by its name.

        The GoodRaw/BadRaw folders are deleted and recreated first. A file is
        good when its name matches ``regex`` and the part after the first
        ``_`` has ``LengthOfDateTimeStampInFile`` characters.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()

        logPath = "TrainingLogs/nameValidationLog.txt"
        goodDir = "TrainingRawfilesValidated/GoodRaw"
        badDir = "TrainingRawfilesValidated/BadRaw"
        try:
            with open(logPath, "a+") as logFile:
                for filename in listdir(self.path):
                    # Copy from the folder that was listed, not a fixed one.
                    sourceFile = os.path.join(self.path, filename)
                    isValid = bool(re.match(regex, filename)) and self._hasValidTimestamp(
                        filename, LengthOfDateTimeStampInFile)
                    if isValid:
                        shutil.copy(sourceFile, goodDir)
                        self.logger.log(logFile, "Valid File name!! File moved to GoodRaw Folder :: %s" % filename)
                    else:
                        shutil.copy(sourceFile, badDir)
                        self.logger.log(logFile, "Invalid File Name!! File moved to Bad Raw Folder :: %s" % filename)

        except Exception as e:
            self._writeLog(logPath, "Error occured while validating FileName %s" % e)
            raise

    def validateColumnLength(self, NumberOfColumn):
        """Move GoodRaw files whose column count is wrong to BadRaw.

        Args:
            NumberOfColumn: expected number of columns in every file.

        Raises:
            OSError: a file could not be moved (logged, then re-raised).
            Exception: any other failure (logged, then re-raised).
        """
        logPath = "TrainingLogs/columnValidationLog.txt"
        goodDir = "TrainingRawfilesValidated/GoodRaw/"
        # Same casing as every other path in this class; the previous
        # 'TrainingRawFilesValidated' does not exist on a case-sensitive FS.
        badDir = "TrainingRawfilesValidated/BadRaw"
        try:
            with open(logPath, "a+") as logFile:
                self.logger.log(logFile, "Column Length Validation Started!!")
                for filename in listdir(goodDir):
                    frame = pd.read_csv(goodDir + filename)
                    # NOTE(review): presumably strips a stray third column
                    # produced by trailing commas - confirm with the data.
                    if frame.shape[1] > 2 and 'Unnamed: 2' in frame.columns:
                        frame = frame.drop(['Unnamed: 2'], axis=1)
                    if frame.shape[1] != NumberOfColumn:
                        shutil.move(goodDir + filename, badDir)
                        self.logger.log(logFile, "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s" % filename)
                self.logger.log(logFile, "Column Length Validation Completed!!")

        except OSError as ex:
            self._writeLog(logPath, "Error Occured while moving the file :: %s" % ex)
            raise

        except Exception as e:
            self._writeLog(logPath, "Error Occured:: %s" % e)
            raise
Пример #8
0
 def __init__(self, path, file):
     """Create the collaborators and open the main prediction log.

     ``path`` and ``file`` are passed to ``PredictionRawDataValidation``
     unchanged; ``file`` is also kept on the instance.
     """
     # Build everything first (same order as before), then attach.
     raw_validator = PredictionRawDataValidation(path, file)
     transformer = dataTransformPredict()
     main_log = open("PredictionLogs/PredictionMainLog.txt", "a+")
     app_logger = AppLogger()

     self.rawData = raw_validator
     self.file = file
     self.dataTransform = transformer
     self.file_object = main_log
     self.log_writer = app_logger
Пример #9
0
class PredictionValidation:
    """Drive the validation pipeline for prediction input files."""

    def __init__(self, path, file):
        self.rawData = PredictionRawDataValidation(path, file)
        self.file = file
        self.dataTransform = dataTransformPredict()
        self.file_object = open("PredictionLogs/PredictionMainLog.txt", "a+")
        self.log_writer = AppLogger()

    def _log(self, message):
        """Write one message to the main prediction log."""
        self.log_writer.log(self.file_object, message)

    def predictionValidation(self):
        """Run every validation stage in order, logging progress.

        Stages, in order: schema lookup, file-name validation, column-count
        validation, missing-value replacement and bad-file archiving.

        Raises:
            Exception: whatever a stage raises is propagated unchanged.

        The main log file opened in ``__init__`` is closed when this method
        finishes, whether it succeeds or fails.
        """
        try:
            self._log("Start Validation on files!!!")

            LengthOfDataTimestamp, _, NumberOfColumns = self.rawData.valuesFromSchema()
            regex = self.rawData.manualRegex()

            self.rawData.validateFileNameRaw(regex, LengthOfDataTimestamp)
            self.rawData.validateColumnLength(NumberOfColumns)

            self._log("Raw Data Validation Complete!!")
            self._log("Starting Data Transforamtion!!")

            self.dataTransform.replaceMissingValueWithNull(None)

            self._log("DataTransformation Completed!!!")
            self._log("Creating PredictionDatabase and tables on the basis of given schema!!!")

            self._log("Moving bad files to Archive and deleting BadData folder!!!")

            self.rawData.moveBadFilesToArchiveBad()
            self._log("Bad files moved to archive!! Bad folder Deleted!!")
            self._log("Validation Operation completed!!")
        finally:
            # Previously the handle stayed open whenever a stage raised.
            self.file_object.close()
 def __init__(self):
     """Store the validated-folder paths, a MongoDB client and a logger."""
     validated_root = "TrainingRawfilesValidated/"
     self.badFilePath = validated_root + "BadRaw/"
     self.goodFilePath = validated_root + "GoodRaw/"
     # Client for the MongoDB server at the local default address.
     mongo_url = "mongodb://127.0.0.1:27017"
     self.client = pymongo.MongoClient(mongo_url)
     self.logger = AppLogger()
class dBOperation:
    """MongoDB operations for validated training data.

    Uses the ``GoodRawData`` collection of the database named by the caller.
    """

    def __init__(self):
        self.badFilePath = "TrainingRawfilesValidated/BadRaw/"
        self.goodFilePath = "TrainingRawfilesValidated/GoodRaw/"
        self.client = pymongo.MongoClient("mongodb://127.0.0.1:27017")
        self.logger = AppLogger()

    def _writeLog(self, logPath, message):
        """Append one message to ``logPath``; the handle is always closed."""
        with open(logPath, 'a+') as handle:
            self.logger.log(handle, message)

    def dataBaseConnection(self, DatabaseName):
        """Return the database object named ``DatabaseName``.

        Raises:
            ConnectionError: logged with its message, then re-raised.
        """
        logPath = "TrainingLogs/DataBaseConnectionLog.txt"
        try:
            conn = self.client[DatabaseName]
            self._writeLog(logPath,
                           "Opened %s database successfully" % DatabaseName)

        except ConnectionError as e:
            # Log the actual error, not the exception class.
            self._writeLog(logPath,
                           "Error while connecting to database: %s" % e)
            raise

        return conn

    def createTableDb(self, DatabaseName):
        """Ensure an empty ``GoodRawData`` collection exists.

        An existing collection has all its documents deleted; a missing one
        is created. The client is closed afterwards.

        Raises:
            Exception: any failure is logged and re-raised.
        """
        connLog = "TrainingLogs/DataBaseConnectionLog.txt"
        createLog = "TrainingLogs/DataBaseCollectionCreateLog.txt"
        insertLog = "TrainingLogs/DataBaseInsertLog.txt"
        closedMessage = "Closed %s database successfully" % DatabaseName

        try:
            conn = self.dataBaseConnection(DatabaseName)

            # list_collection_names / count_documents / delete_many replace
            # collection_names / Cursor.count / remove, which PyMongo 4
            # no longer provides.
            if "GoodRawData" in conn.list_collection_names():
                if conn.GoodRawData.count_documents({}) == 0:
                    outcome = "No Data Found!!"
                else:
                    conn.GoodRawData.delete_many({})
                    outcome = "Data Deletion Successfully!!"
                self.client.close()

                self._writeLog(createLog, "Collection Already Exists!!")
                self._writeLog(insertLog, outcome)
            else:
                conn.create_collection("GoodRawData")
                self.client.close()

                self._writeLog(createLog, "Collection created successfully!!")

            self._writeLog(connLog, closedMessage)

        except Exception as e:
            self._writeLog(createLog, "Error while creating table: %s " % e)
            self.client.close()
            self._writeLog(connLog, closedMessage)
            raise

    def insertIntoTableGoodData(self, Database):
        """Insert every CSV under ``self.goodFilePath`` into ``GoodRawData``.

        Each row becomes one document with ``Comments`` and ``Ratings`` keys.
        A file that fails is logged and skipped so the remaining files are
        still processed; the success message is written only when no file
        failed. The client is closed in every case.
        """
        conn = self.dataBaseConnection(Database)
        failures = 0

        # The log used to be closed inside the failure handler, which made
        # every later write raise ValueError on a closed file.
        with open("TrainingLogs/DataBaseInsertLog.txt", 'a+') as logFile:
            try:
                fileNames = os.listdir(self.goodFilePath)
                if not fileNames:
                    print("No files")
                for name in fileNames:
                    try:
                        data = pd.read_csv(os.path.join(self.goodFilePath, name))
                        documents = [{
                            'Comments': comment,
                            'Ratings': rating
                        } for comment, rating in zip(data['Comments'],
                                                     data['Ratings'])]
                        # insert_many rejects an empty list, so skip files
                        # that have a header but no rows.
                        if documents:
                            conn.GoodRawData.insert_many(documents)
                    except Exception as e:
                        failures += 1
                        self.logger.log(
                            logFile,
                            "Insertion in Collection Failed. Error: %s" % e)
            finally:
                self.client.close()

            if not failures:
                self.logger.log(logFile,
                                "Insertion in Collection Successfully.")

    def selectingDatafromtableintocsv(self, Database):
        """Export ``GoodRawData`` to ``TrainingFileFromDB/InputFile.csv``.

        Nothing is written when the collection is empty. Export failures are
        logged and not raised; a failure to obtain the database is raised.
        """
        self.fileFromDb = 'TrainingFileFromDB/'
        self.fileName = 'InputFile.csv'
        # Opened once; the second, identical call was redundant.
        conn = self.dataBaseConnection(Database)

        with open("TrainingLogs/ExportToCsv.txt", 'a+') as log_file:
            try:
                if conn.GoodRawData.count_documents({}) == 0:
                    self.logger.log(log_file,
                                    'No Record in GoodRawData Collection')
                    return

                if not os.path.isdir(self.fileFromDb):
                    os.makedirs(self.fileFromDb)

                projection = {"_id": 0, "Comments": 1, "Ratings": 1}
                # Explicit key access keeps a missing field an error.
                data = [{
                    'Comments': row['Comments'],
                    'Ratings': row['Ratings']
                } for row in conn.GoodRawData.find({}, projection)]

                dataframe = pd.DataFrame(data, columns=['Comments', 'Ratings'])
                dataframe.to_csv(os.path.join(self.fileFromDb, self.fileName),
                                 index=None)

                self.logger.log(log_file, 'CSV File Exported Successfully !!!')
                self.logger.log(
                    log_file,
                    'Successfully Executed selectingDatafromtableintocsv method of dbOperation class of dbOperation package'
                )

            except Exception as e:
                self.logger.log(log_file,
                                "File exporting failed. Error : %s" % e)
Пример #12
0
 def __init__(self, path, file):
     """Remember ``path`` and ``file``, the schema file name, and a logger."""
     schema_file = 'schema_prediction.json'
     self.schema_path = schema_file
     self.path, self.file = path, file
     self.logger = AppLogger()