Example #1
class dataTransformPredict:
    """
          This class shall be used for transforming the Good Raw Training Data before loading it in Database!!.
     """
    def __init__(self):
        self.goodDataPath = "Prediction_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()

    def addQuotesToStringValuesInColumn(self):
        """
              Method Name: addQuotesToStringValuesInColumn
              Description: This method replaces the missing values in columns with "NULL" to
                           store in the table. We are using substring in the first column to
                           keep only "Integer" data for ease up the loading.
                           This column is anyways going to be removed during prediction.
          """

        log_file = 'dataTransformLog'
        try:
            onlyfiles = self.awsObj.listDirFiles(self.goodDataPath)
            for file in onlyfiles:
                data = self.awsObj.csvToDataframe(self.goodDataPath, file)
                data['stalk-root'] = data['stalk-root'].replace('?', "'?'")
                self.awsObj.saveDataframeToCsv(self.goodDataPath, file, data)
                self.logger.log(log_file,
                                " %s: Quotes added successfully!!" % file)
        except Exception as e:
            self.logger.log(log_file,
                            "Data Transformation failed because:: %s" % e)
            raise e
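A minimal, self-contained sketch of what this transformation does to the 'stalk-root' column, using a toy pandas DataFrame (the sample values are made up for illustration; only pandas is required):

import pandas as pd

# Toy stand-in for one validated prediction file.
data = pd.DataFrame({'stalk-root': ['e', '?', 'c', '?']})

# Same step as addQuotesToStringValuesInColumn: wrap '?' in single quotes so it is
# stored as a literal string value instead of being interpreted as a missing marker.
data['stalk-root'] = data['stalk-root'].replace('?', "'?'")

print(data['stalk-root'].tolist())  # ['e', "'?'", 'c', "'?'"]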
Example #2
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data before loading it into the database.
    """

    def __init__(self):
        self.goodDataPath = "Training_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()

    def addQuotesToStringValuesInColumn(self):
        """
            Method Name: addQuotesToStringValuesInColumn
            Description: This method wraps the '?' values found in the string columns in single
                         quotes. This is done to avoid errors while inserting string values
                         into the table as varchar.
        """
        log_file = 'addQuotesToStringValuesInColumn'
        try:
            onlyfiles = self.awsObj.listDirFiles(self.goodDataPath)
            for file in onlyfiles:
                data = self.awsObj.csvToDataframe(self.goodDataPath, file)
                for column in data.columns:
                    count = data[column][data[column] == '?'].count()
                    if count != 0:
                        data[column] = data[column].replace('?', "'?'")
                self.awsObj.saveDataframeToCsv(self.goodDataPath, file, data)
                self.logger.log(log_file, " %s: Quotes added successfully!!" % file)
        except Exception as e:
            self.logger.log(log_file, "Data Transformation failed because:: %s" % e)
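Assuming the dataTransform class above and its App_Logger / AwsStorageManagement dependencies are already importable from the project, a typical call on the training side is simply:

# Quote the '?' values in every file under Training_Good_Raw_Files_Validated before the DB load.
transformer = dataTransform()
transformer.addQuotesToStringValuesInColumn()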
Example #3
class Data_Getter:
    """
    This class shall be used for obtaining the data from the source for training.
    """
    def __init__(self, file_object, logger_object):
        self.training_file = 'InputFile.csv'
        self.file_object = file_object
        self.logger_object = logger_object
        self.awsObj = AwsStorageManagement()

    def get_data(self):
        """
            Method Name: get_data
            Description: This method reads the data from source.
            Output: A pandas DataFrame.
            On Failure: Raise Exception
        """
        self.logger_object.log(
            self.file_object,
            'Entered the get_data method of the Data_Getter class')
        try:
            print('Loading Dataframe')
            self.data = self.awsObj.csvToDataframe('Training_FileFromDB',
                                                   self.training_file)
            print('Dataframe Loaded')
            self.logger_object.log(
                self.file_object,
                'Data Load Successful. Exited the get_data method of the Data_Getter class'
            )
            return self.data
        except Exception as e:
            self.logger_object.log(
                self.file_object,
                'Exception occurred in get_data method of the Data_Getter class. Exception message: '
                + str(e))
            self.logger_object.log(
                self.file_object,
                'Data Load Unsuccessful. Exited the get_data method of the Data_Getter class'
            )
            raise e
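A minimal usage sketch for Data_Getter, assuming App_Logger is available and that the training data has already been exported to Training_FileFromDB/InputFile.csv by the dBOperation class shown later (the log name 'ModelTrainingLog' is a placeholder, not taken from the example):

file_object = 'ModelTrainingLog'      # placeholder log target passed through to App_Logger
logger_object = App_Logger()
data_getter = Data_Getter(file_object, logger_object)
df = data_getter.get_data()           # pandas DataFrame read from Training_FileFromDB/InputFile.csv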
Example #4
class Prediction_Data_validation:
    """
        This class shall be used for handling all the validation done on the raw prediction data.
    """
    def __init__(self, path):
        self.Batch_Directory = path
        self.schema_path = 'schema_prediction.json'
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def valuesFromSchema(self):
        """
            Method Name: valuesFromSchema
            Description: This method extracts all the relevant information from the pre-defined "Schema" file.
            Output: LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, Number of Columns
            On Failure: Raise ValueError, KeyError, or Exception
        """
        try:
            if not self.dbObj.isCollectionPresent('mushroomClassifierDB',
                                                  'predict_schema'):
                with open(self.schema_path, 'r') as f:
                    dic = json.load(f)
                    f.close()
                self.dbObj.insertOneRecord('mushroomClassifierDB',
                                           'predict_schema', dic)
            dic = self.dbObj.getRecords('mushroomClassifierDB',
                                        'predict_schema')
            pattern = dic['SampleFileName']
            LengthOfDateStampInFile = dic['LengthOfDateStampInFile']
            LengthOfTimeStampInFile = dic['LengthOfTimeStampInFile']
            column_names = dic['ColName']
            NumberofColumns = dic['NumberofColumns']

            file = 'valuesfromSchemaValidationLog'
            message = ("LengthOfDateStampInFile:: %s" % LengthOfDateStampInFile + "\t" +
                       "LengthOfTimeStampInFile:: %s" % LengthOfTimeStampInFile + "\t " +
                       "NumberofColumns:: %s" % NumberofColumns + "\n")
            self.logger.log(file, message)

        except ValueError:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(
                file, "ValueError: Value not found inside schema_prediction.json")
            raise ValueError

        except KeyError:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(file,
                            "KeyError: Incorrect key passed for the schema")
            raise KeyError

        except Exception as e:
            file = 'valuesfromSchemaValidationLog'
            self.logger.log(file, str(e))
            raise e

        return LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns
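        # For reference, schema_prediction.json is expected to provide at least the keys read
        # above. A hypothetical example (values are illustrative, not the project's real schema):
        # {
        #     "SampleFileName": "mushroom_08012020_120212.csv",
        #     "LengthOfDateStampInFile": 8,
        #     "LengthOfTimeStampInFile": 6,
        #     "ColName": {"class": "varchar", "cap-shape": "varchar", "stalk-root": "varchar"},
        #     "NumberofColumns": 23
        # }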

    def manualRegexCreation(self):
        """
          Method Name: manualRegexCreation
          Description: This method contains a manually defined regex based on the "FileName" given in "Schema" file.
                      This Regex is used to validate the filename of the prediction data.
          Output: Regex pattern
          On Failure: None
        """
        # Matches file names of the form mushroom_<datestamp>_<timestamp>.csv
        regex = r"['mushroom']+['\_'']+[\d_]+[\d]+\.csv"
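        # Illustration against hypothetical file names (not taken from the schema):
        #   bool(re.match(regex, "mushroom_08012020_120212.csv"))   -> True
        #   bool(re.match(regex, "wrongname_08012020_120212.csv"))  -> False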
        return regex

    def createDirectoryForGoodBadRawData(self):
        """
            Method Name: createDirectoryForGoodBadRawData
            Description: This method creates directories to store the Good Data and Bad Data
                          after validating the prediction data.

            Output: None
            On Failure: Exception
        """
        try:
            self.awsObj.createS3Directory(
                'Prediction_Good_Raw_Files_Validated')
            self.awsObj.createS3Directory('Prediction_Bad_Raw_Files_Validated')
        except Exception as ex:
            file = 'GeneralLog'
            self.logger.log(file, "Error while creating Directory %s:" % ex)

    def deleteExistingGoodDataTrainingFolder(self):
        """
            Method Name: deleteExistingGoodDataTrainingFolder
            Description: This method deletes the directory made to store the Good Data
                          after loading the data in the table. Once the good files are
                          loaded in the DB,deleting the directory ensures space optimization.
            Output: None
            On Failure: Exception
        """
        try:
            file = 'GeneralLog'
            self.awsObj.deleteDirectory('Prediction_Good_Raw_Files_Validated')
            self.logger.log(file, "GoodRaw directory deleted successfully!!!")
        except Exception as s:
            file = 'GeneralLog'
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            raise s

    def deleteExistingBadDataTrainingFolder(self):
        """
            Method Name: deleteExistingBadDataTrainingFolder
            Description: This method deletes the directory made to store the bad Data.
            Output: None
            On Failure: Exception
        """

        try:
            file = 'GeneralLog'
            self.awsObj.deleteDirectory('Prediction_Bad_Raw_Files_Validated')
            self.logger.log(
                file, "BadRaw directory deleted before starting validation!!!")
        except Exception as s:
            file = 'GeneralLog'
            self.logger.log(file, "Error while Deleting Directory : %s" % s)
            raise s

    def moveBadFilesToArchiveBad(self):
        """
            Method Name: moveBadFilesToArchiveBad
            Description: This method deletes the directory made to store the Bad Data
                          after moving the data to an archive folder. We archive the bad
                          files to send them back to the client for the invalid data issue.
            Output: None
            On Failure: Exception
        """
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            target_folder = 'PredictionArchivedBadData/BadData_' + str(
                date) + "_" + str(time)
            self.awsObj.copyFileToFolder('Prediction_Bad_Raw_Files_Validated',
                                         target_folder)

            file = 'GeneralLog'
            self.logger.log(file, "Bad files moved to archive")

            self.awsObj.deleteDirectory('Prediction_Bad_Raw_Files_Validated')
            self.logger.log(file, "Bad Raw Data Folder Deleted successfully!!")
        except Exception as e:
            file = 'GeneralLog'
            self.logger.log(file,
                            "Error while moving bad files to archive:: %s" % e)
            raise e

    def validationFileNameRaw(self, regex, LengthOfDateStampInFile,
                              LengthOfTimeStampInFile):
        """
            Method Name: validationFileNameRaw
            Description: This function validates the name of the prediction csv files as per the name given in the schema.
                         A regex pattern is used to do the validation. If the name format does not match, the file is moved
                         to the Bad Raw Data folder; otherwise it is kept in the Good Raw Data folder.
            Output: None
            On Failure: Exception
        """
        # delete the directories for good and bad data in case last run was unsuccessful and folders were not deleted.
        self.deleteExistingBadDataTrainingFolder()
        self.deleteExistingGoodDataTrainingFolder()
        self.createDirectoryForGoodBadRawData()
        batch_dir = self.Batch_Directory.strip('/').strip('\\')
        print('Prediction File Path: ', batch_dir)
        self.awsObj.uploadFiles(batch_dir, batch_dir)
        onlyfiles = self.awsObj.listDirFiles(batch_dir)
        try:
            f = 'nameValidationLog'
            for filename in onlyfiles:
                if (re.match(regex, filename)):
                    splitAtDot = re.split('.csv', filename)
                    splitAtDot = (re.split('_', splitAtDot[0]))
                    if len(splitAtDot[1]) == LengthOfDateStampInFile:
                        if len(splitAtDot[2]) == LengthOfTimeStampInFile:
                            self.awsObj.copyFileToFolder(
                                batch_dir,
                                'Prediction_Good_Raw_Files_Validated',
                                filename)
                            self.logger.log(
                                f,
                                "Valid File name!! File moved to GoodRaw Folder :: %s"
                                % filename)

                        else:
                            self.awsObj.copyFileToFolder(
                                batch_dir,
                                'Prediction_Bad_Raw_Files_Validated', filename)
                            self.logger.log(
                                f,
                                "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                                % filename)
                    else:
                        self.awsObj.copyFileToFolder(
                            batch_dir,
                            'Prediction_Bad_Raw_Files_Validated', filename)
                        self.logger.log(
                            f,
                            "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                            % filename)
                else:
                    self.awsObj.copyFileToFolder(
                        batch_dir,
                        'Prediction_Bad_Raw_Files_Validated', filename)
                    self.logger.log(
                        f,
                        "Invalid File Name!! File moved to Bad Raw Folder :: %s"
                        % filename)

        except Exception as e:
            f = 'nameValidationLog'
            self.logger.log(f,
                            "Error occured while validating FileName %s" % e)
            raise e

    def validateColumnLength(self, NumberofColumns):
        """
            Method Name: validateColumnLength
            Description: This function validates the number of columns in the csv files.
                         It should be the same as given in the schema file.
                         If it is not, the file is not suitable for processing and is moved to the Bad Raw Data folder.
                         If the column number matches, the file is kept in Good Raw Data for processing.
            Output: None
            On Failure: Exception
        """
        try:
            f = 'columnValidationLog'
            self.logger.log(f, "Column Length Validation Started!!")
            file_list = self.awsObj.listDirFiles(
                'Prediction_Good_Raw_Files_Validated')
            for file in file_list:
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                if csv.shape[1] == NumberofColumns:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
                else:
                    self.awsObj.moveFileToFolder(
                        'Prediction_Good_Raw_Files_Validated',
                        'Prediction_Bad_Raw_Files_Validated', file)
                    self.logger.log(
                        f,
                        "Invalid Column Length for the file!! File moved to Bad Raw Folder :: %s"
                        % file)

            self.logger.log(f, "Column Length Validation Completed!!")
        except OSError as oe:
            f = 'columnValidationLog'
            self.logger.log(
                f, "Error Occurred while moving the file :: %s" % oe)
            raise oe
        except Exception as e:
            f = 'columnValidationLog'
            self.logger.log(f, "Error Occurred:: %s" % e)
            raise e

    def deletePredictionFile(self):
        """
            Method Name: deletePredictionFile
            Description: This method deletes the previous prediction output file
                         (Prediction_Output_File/Predictions.csv), if any, before a new run.
            Output: None
        """
        self.awsObj.deleteFile('Prediction_Output_File', 'Predictions.csv')

    def validateMissingValuesInWholeColumn(self):
        """
              Method Name: validateMissingValuesInWholeColumn
              Description: This function validates if any column in the csv file has all values missing.
                           If all the values are missing, the file is not suitable for processing.
                           Such files are moved to the Bad Raw Data folder.
              Output: None
              On Failure: Exception
        """
        try:
            f = 'missingValuesInColumn'
            self.logger.log(f, "Missing Values Validation Started!!")
            file_list = self.awsObj.listDirFiles(
                'Prediction_Good_Raw_Files_Validated')
            for file in file_list:
                csv = self.awsObj.csvToDataframe(
                    'Prediction_Good_Raw_Files_Validated', file)
                count = 0
                for columns in csv:
                    if (len(csv[columns]) - csv[columns].count()) == len(
                            csv[columns]):
                        count += 1
                        self.awsObj.moveFileToFolder(
                            'Prediction_Good_Raw_Files_Validated',
                            'Prediction_Bad_Raw_Files_Validated', file)
                        self.logger.log(
                            f,
                            "Column with all values missing!! File moved to Bad Raw Folder :: %s"
                            % file)
                        break
                if count == 0:
                    self.awsObj.saveDataframeToCsv(
                        'Prediction_Good_Raw_Files_Validated', file, csv)
        except OSError as oe:
            f = 'missingValuesInColumn'
            self.logger.log(
                f, "Error Occurred while moving the file :: %s" % oe)
            raise oe
        except Exception as e:
            f = 'missingValuesInColumn'
            self.logger.log(f, "Error Occurred:: %s" % e)
            raise e
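The individual checks above are typically chained together by a prediction-validation driver. A minimal sketch of that orchestration, assuming a local batch folder of raw prediction CSVs (the folder name 'Prediction_Batch_Files' is a placeholder):

validator = Prediction_Data_validation('Prediction_Batch_Files')   # placeholder batch path

# File-name rules and the expected column count come from schema_prediction.json / MongoDB.
LengthOfDateStampInFile, LengthOfTimeStampInFile, column_names, NumberofColumns = validator.valuesFromSchema()
regex = validator.manualRegexCreation()

# Name, column-count, and missing-value checks; files are sorted into the Good/Bad raw folders.
validator.validationFileNameRaw(regex, LengthOfDateStampInFile, LengthOfTimeStampInFile)
validator.validateColumnLength(NumberofColumns)
validator.validateMissingValuesInWholeColumn()

# Drop any previous prediction output before a fresh run.
validator.deletePredictionFile()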
Example #5
class dBOperation:
    """
      This class shall be used for handling all the database (MongoDB) operations.
    """
    def __init__(self):
        self.path = 'Training_Database'
        self.badFilePath = "Training_Bad_Raw_Files_Validated"
        self.goodFilePath = "Training_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()
        self.dbObj = mongoDBOperation()

    def createTableDb(self, DatabaseName, column_names):
        """
            Method Name: createTableDb
            Description: This method creates a collection (table) in the given database which will be used to insert the Good data after raw data validation.
            Output: None
            On Failure: Raise Exception
        """
        try:
            self.dbObj.createOrGetCollection(DatabaseName, 'Good_Raw_Data')
            file = 'DbTableCreateLog'
            self.logger.log(file, "Tables created successfully!!")

            file = 'DataBaseConnectionLog'
            self.logger.log(file,
                            "Closed %s database successfully" % DatabaseName)

        except Exception as e:
            file = 'DbTableCreateLog'
            self.logger.log(file, "Error while creating table: %s " % e)

            file = 'DataBaseConnectionLog'
            self.logger.log(file,
                            "Closed %s database successfully" % DatabaseName)
            raise e

    def insertIntoTableGoodData(self, Database):
        """
           Method Name: insertIntoTableGoodData
           Description: This method inserts the Good data files from the Good_Raw folder into the
                        above created table.
           Output: None
           On Failure: Raise Exception
        """

        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        onlyfiles = self.awsObj.listDirFiles(goodFilePath)
        log_file = 'DbInsertLog'
        self.dbObj.dropCollection(Database, 'Good_Raw_Data')
        print(onlyfiles)
        for file in onlyfiles:
            try:
                df_csv = self.awsObj.csvToDataframe(self.goodFilePath, file)
                print('df_csv: ', df_csv.shape)
                self.dbObj.dataframeToRecords(Database, 'Good_Raw_Data',
                                              df_csv)

            except Exception as e:
                self.logger.log(log_file,
                                "Error while inserting data into table: %s " % e)
                self.awsObj.moveFileToFolder(goodFilePath, badFilePath, file)
                self.logger.log(log_file, "File Moved Successfully %s" % file)
        print('Data pushed to mongodb...')

    def selectingDatafromtableintocsv(self, Database):
        """
           Method Name: selectingDatafromtableintocsv
           Description: This method exports the data in the GoodData table as a CSV file
                        to a given location.
           Output: None
           On Failure: Raise Exception
        """

        self.fileFromDb = 'Training_FileFromDB'
        self.fileName = 'InputFile.csv'
        self.awsObj.createS3Directory(self.fileFromDb)
        log_file = 'ExportToCsv'
        try:
            tmp_csv = self.dbObj.recordsToDataFrame(Database, 'Good_Raw_Data')
            self.awsObj.saveDataframeToCsv('Training_FileFromDB',
                                           self.fileName, tmp_csv)

            self.logger.log(log_file, "File exported successfully!!!")
            print('Saving data to final csv')

        except Exception as e:
            self.logger.log(log_file, "File exporting failed. Error : %s" % e)
            raise e
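A minimal sketch of how the three methods above fit together on the training side. The database name mirrors the 'mushroomClassifierDB' used by the validation class; reusing it here for training is an assumption, and column_names stands for the ColName mapping read from the training schema (not shown in these examples):

db_op = dBOperation()
database = 'mushroomClassifierDB'              # assumed database name, mirroring the validation class

db_op.createTableDb(database, column_names)    # create/get the Good_Raw_Data collection
db_op.insertIntoTableGoodData(database)        # push each Good_Raw file into the collection
db_op.selectingDatafromtableintocsv(database)  # export everything to Training_FileFromDB/InputFile.csv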