Exemplo n.º 1
0
 def __init__(self, mainFilepath, additionalFilepath):
     """Create the collaborators this object works with.

     Builds a Raw_Data_Validation for the two given paths, a
     preprocessing_beforeDB helper and a DBOperations helper, opens the
     main prediction log in append mode, and creates an App_Logger.
     Nothing in this method closes the log handle; it is kept on the
     instance as ``file_object``.
     """
     validator = Raw_Data_Validation(mainFilepath, additionalFilepath)
     self.raw_data = validator
     self.preproccesing_beforeDB = preprocessing_beforeDB()
     self.DbOperation = DBOperations()
     main_log_path = 'Prediction_Logs/Prediction_Main_Log.txt'
     self.file_object = open(main_log_path, 'a+')
     self.log_writer = App_Logger()
Exemplo n.º 2
0
 def __init__(self):
     """Set the database path, the four good/bad raw-file paths and a logger."""
     validated_root = "Prediction_Raw_Validated_File/"
     self.path = "Prediction_Database/"
     self.goodRaw_MainFile_path = validated_root + "Good_Raw_MainFile"
     self.badRaw_MainFile_path = validated_root + "Bad_Raw_MainFile"
     self.goodRaw_AdditionalFile_path = validated_root + "Good_Raw_AdditionalFile"
     self.badRaw_AdditionalFile_path = validated_root + "Bad_Raw_AdditionalFile"
     self.logger = App_Logger()
class preprocessing_beforeDB:
    """Rewrites the validated training CSV files before they are loaded.

    Every file found in the good-raw main / additional directories is read
    with pandas, its missing values are replaced by the string ``'NULL'``,
    and the file is written back in place.
    """

    # Both public methods append their progress and failures to this file.
    _LOG_PATH = "Training_Logs/data_preprocessing_beforeDB.txt"

    def __init__(self):
        self.goodData_MainFile_path = "Training_Raw_Validated_File/Good_Raw_MainFile"
        self.goodData_AdditionalFile_path = "Training_Raw_Validated_File/Good_Raw_AdditionalFile"
        self.logger = App_Logger()

    def _replace_missing_in_directory(self, directory, log_file, success_message):
        """Rewrite every file in *directory* with missing values set to 'NULL'.

        One *success_message* entry is logged per rewritten file.
        """
        for name in os.listdir(directory):
            file_path = directory + "/" + name
            frame = pd.read_csv(file_path)
            frame = frame.fillna('NULL')
            frame.to_csv(file_path, index=False, header=True)
            self.logger.log(log_file, success_message)

    def replaceMissingWithNull_MainFile(self):
        """Replace missing values with 'NULL' in every good-raw main file.

        Failures are logged and swallowed (best effort); the log file is
        always closed, including when processing fails part-way through.
        """
        with open(self._LOG_PATH, "a+") as log_file:
            try:
                self._replace_missing_in_directory(
                    self.goodData_MainFile_path, log_file,
                    'Replace Missing values with Null Values in Good Raw Main File Successfully !!')
            except Exception as e:
                self.logger.log(
                    log_file,
                    'Replace missing with Null Values failed in Main File becasue:: %s' % str(e))

    def replaceMissingWithNull_AdditionalFile(self):
        """Replace missing values with 'NULL' in every good-raw additional file.

        Failures are logged and swallowed (best effort); the log file is
        closed exactly once on every path.
        """
        with open(self._LOG_PATH, "a+") as log_file:
            try:
                self._replace_missing_in_directory(
                    self.goodData_AdditionalFile_path, log_file,
                    'Replace Missing values with Null Values in Additional Raw Main File Successfully !!')
            except Exception as e:
                self.logger.log(
                    log_file,
                    'Replace missing with Null Values failed in Additional File becasue:: %s' % str(e))
Exemplo n.º 4
0
class TrainModel:
    """Runs the training pipeline: load, preprocess, split, tune, save."""

    def __init__(self):
        self.log_writer = App_Logger()
        # Opened here and closed at the end of train_model(), so one
        # instance supports a single training run.
        self.file_object = open('Training_Logs/ModelTrainingLog.txt', 'a+')

    def train_model(self):
        """Train on the loaded data and save the best model found.

        Loads the main and additional data, runs the preprocessing steps,
        splits features/labels 70/30, asks ``tuner.Model_Finder`` for the
        best model and saves it through ``file_methods.File_Operation``.

        Raises:
            Exception: any error from the pipeline is logged as an
                unsuccessful run and re-raised unchanged.
        """
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            main_data, additional_data = data_getter.get_data()

            preprocessor = data_preprocessing.PreProcessor(
                self.file_object, self.log_writer)
            is_null_present = preprocessor.is_null_present(main_data)
            # Explicit comparison kept: the return type of is_null_present
            # is not visible from this file.
            if is_null_present == True:
                main_data = preprocessor.impute_missing_values(main_data)
            main_data = preprocessor.map_ip_to_country(main_data,
                                                       additional_data)
            main_data = preprocessor.difference_signup_and_purchase(main_data)
            main_data = preprocessor.encoding_browser(main_data)
            main_data = preprocessor.encoding_source(main_data)
            main_data = preprocessor.encoding_sex(main_data)
            main_data = preprocessor.count_frequency_encoding_country(
                main_data)
            main_data = preprocessor.remove_unwanted_cols(main_data)
            x, y = preprocessor.separate_label_feature(main_data, 'class')

            x_train, x_test, y_train, y_test = train_test_split(x,
                                                                y,
                                                                test_size=0.3)

            # NOTE: SMOTE over-sampling of the training split
            # (preprocessor.over_sampling_smote) is currently disabled.

            model_finder = tuner.Model_Finder(self.file_object,
                                              self.log_writer)
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            file_op.save_model(best_model, best_model_name)

            self.log_writer.log(self.file_object,
                                'Successfull End of Training')
        except Exception:
            self.log_writer.log(self.file_object,
                                'Unsuccessfull End of Training')
            raise
        finally:
            # Single cleanup point for both the success and failure paths.
            self.file_object.close()
Exemplo n.º 5
0
class Prediction_Row:
    """Predicts the class of a single data row with the saved model."""

    def __init__(self):
        self.log_writer = App_Logger()
        # Opened here and closed at the end of predictRow(), so one
        # instance supports a single prediction.
        self.file_object = open('Prediction_Logs/PredictionLog.txt', 'a+')

    def predictRow(self, datarow):
        """Preprocess *datarow*, run the saved model on it and return the result.

        The row goes through the ``PreProcessorRow`` steps, has
        ``purchase_value`` and ``age`` converted to numeric, and is
        re-indexed to the fixed column order passed to ``model.predict``.

        Args:
            datarow: the row to predict; it is also stored on the instance
                as ``self.datarow`` and replaced as preprocessing proceeds.

        Returns:
            ``str`` of the first element of the model's prediction.

        Raises:
            Exception: any error is logged with its message and re-raised.
        """
        self.log_writer.log(self.file_object, 'Start of DataRow Prediction')
        self.datarow = datarow
        try:
            preprocessor = data_preprocessing.PreProcessorRow(
                self.datarow, self.file_object, self.log_writer)
            self.datarow = preprocessor.row_map_ip_to_country(self.datarow)
            self.datarow = preprocessor.row_difference_signup_and_purchase(
                self.datarow)
            self.datarow = preprocessor.row_encoding_browser(self.datarow)
            self.datarow = preprocessor.row_encoding_source(self.datarow)
            self.datarow = preprocessor.row_encoding_sex(self.datarow)
            self.datarow = preprocessor.row_count_frequency_encoding_country(
                self.datarow)
            self.datarow = preprocessor.row_remove_unwanted_cols(self.datarow)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            model_name = file_loader.find_correct_model_file()
            model = file_loader.load_model(model_name)
            self.datarow['purchase_value'] = pd.to_numeric(
                self.datarow['purchase_value'])
            self.datarow['age'] = pd.to_numeric(self.datarow['age'])
            # Column order handed to the model.
            self.datarow = self.datarow.reindex([
                'purchase_value', 'age', 'd_day', 'd_hour', 'd_minutes',
                'd_seconds', 'FireFox', 'IE', 'Opera', 'Safari', 'Direct',
                'SEO', 'M', 'country_encode'
            ],
                                                axis=1)
            result = model.predict(self.datarow)

            self.log_writer.log(self.file_object,
                                'Successfull End of DataRow Prediction')
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error Occured while doing the DataRaw Prediction !! Error :: %s'
                % str(e))
            raise
        finally:
            # Single cleanup point for both the success and failure paths.
            self.file_object.close()
        return str(result[0])
 def __init__(self):
     """Set the good-raw main/additional prediction paths and a logger."""
     validated_root = "Prediction_Raw_Validated_File/"
     self.goodData_MainFile_path = validated_root + "Good_Raw_MainFile"
     self.goodData_AdditionalFile_path = validated_root + "Good_Raw_AdditionalFile"
     self.logger = App_Logger()
 def __init__(self, mainfile_path, additionalfile_path):
     """Store the two path arguments, the schema file name and a logger."""
     self.batch_directory_MainFile, self.batch_directory_AdditionalFile = (
         mainfile_path, additionalfile_path)
     self.schema_path = 'schema_Training.json'
     self.logger = App_Logger()
class Raw_Data_Validation:
    def __init__(self, mainfile_path, additionalfile_path):
        """Remember the two batch directories, the schema file name and a logger.

        The two directories are the ones later listed by the file-name
        validation methods of this class.
        """
        self.batch_directory_MainFile, self.batch_directory_AdditionalFile = (
            mainfile_path, additionalfile_path)
        self.schema_path = 'schema_Training.json'
        self.logger = App_Logger()

    def fetch_values_from_schema(self):
        try:
            with open(self.schema_path, 'r') as r:
                dic = json.load(r)
                r.close()
            main_file = dic['SampleFileName_Main']
            additional_file = dic['SampleFileName_Additional']
            main_lengthofdatestampinfile = dic['Main_LengthOfDateStampInFile']
            additional_lengthofdatestampinfile = dic[
                'Additional_LengthOfDateStampInFile']
            main_lengthoftimestampinfile = dic['Main_LengthOfTimeStampInFile']
            additional_lengthoftimestampinfile = dic[
                'Additional_LengthOfTimeStampInFile']
            no_col_mainfile = dic['NumberOfColumns_MainFile']
            no_col_additionalfile = dic['NumberOfColumns_AdditionalFile']
            mainfile_col_name = dic['MainFile_ColName']
            additionalfile_colname = dic['AdditionalFile_ColName']

            file = open('Training_Logs/valuesfromschema_Validation_Log.txt',
                        'a+')
            message = "Number of Columns in Main File:: %s" % mainfile_col_name + "Number of Columns in Additional File:: %s" % additionalfile_colname + "\n" + "MainFile Length of DateStamp::%s" % main_lengthofdatestampinfile + "\n" + "MainFile Length of TimeStamp:: %s" % main_lengthoftimestampinfile
            self.logger.log(file, message)
            file.close()
        except ValueError:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt',
                        'a+')
            self.logger.log(
                file,
                'Value Error : Value not Found inside schema_Training.json')
            file.close()
            raise ValueError

        except KeyError:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt',
                        'a+')
            self.logger.log(
                file, 'Key Error : Key Value Error Incorrect Key Passed !!')
            file.close()
            raise KeyError

        except Exception as e:
            file = open('Training_Logs/valuesfromschema_Validation_Log.txt',
                        'a+')
            self.logger.log(file, str(e))
            file.close()
            raise e
        return main_file, additional_file, main_lengthofdatestampinfile, main_lengthoftimestampinfile, additional_lengthofdatestampinfile, additional_lengthoftimestampinfile, mainfile_col_name, additionalfile_colname, no_col_mainfile, no_col_additionalfile

    def mainfile_manualRegexCreation(self):
        regex = "['Fraud_Data_']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def additionalfile_manualRegexCreation(self):
        regex = "['IPAddress_To_Country_']+['\_'']+[\d_]+[\d]+\.csv"
        return regex

    def createDirectoryFor_GoodBadRawData_MainFile(self):
        try:
            path = os.path.join("Training_Raw_Validated_File/",
                                "Good_Raw_MainFile/")
            if not os.path.isdir(path):
                os.makedirs(path)

            path = os.path.join("Training_Raw_Validated_File/",
                                "Bad_Raw_MainFile/")
            if not os.path.isdir(path):
                os.makedirs(path)

        except OSError as ex:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(
                file,
                'Error while creating MainFile Good and Bad Directory %s' % ex)
            file.close()
            raise OSError

    def createDirectoryFor_GoodBadRawData_AdditionalFile(self):
        try:
            path = os.path.join("Training_Raw_Validated_File/",
                                "Good_Raw_AdditionalFile")
            if not os.path.isdir(path):
                os.makedirs(path)

            path = os.path.join("Training_Raw_Validated_File/",
                                "Bad_Raw_AdditionalFile")
            if not os.path.isdir(path):
                os.makedirs(path)

        except OSError as ex:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(
                file,
                'Error while creating Additional Good and Bad Directory %s' %
                ex)
            file.close()
            raise OSError

    def deleteExistingGoodDataTrainingDir_MainFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Good_Raw_MainFile/'):
                shutil.rmtree(path + 'Good_Raw_MainFile/')
                file = open('Training_Logs/General_Log.txt', 'a+')
                self.logger.log(
                    file,
                    'Good Raw Main File Directory deleted Sucessfully !!!')
                file.close()

        except OSError as ex:
            file = open('Training_Logs/General_Log.txt', 'a+')
            self.logger.log(
                file,
                'Error while deleting Main File Good Raw Directory: %s' % ex)
            file.close()
            raise OSError

    def deleteExistingGoodDataTrainingDir_AdditionalFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Good_Raw_AdditionalFile/'):
                shutil.rmtree(path + 'Good_Raw_AdditionalFile/')
                file = open('Training_Logs/General_Log.txt', 'a+')
                self.logger.log(
                    file,
                    'Good Raw Main File Directory deleted Sucessfully !!!')
                file.close()

        except OSError as ex:
            file = open('Training_Logs/General_Log.txt', 'a+')
            self.logger.log(file,
                            'Error while deleting Good Raw Directory: %s' % ex)
            file.close()
            raise OSError

    def deleteExistingBadDataTrainingDir_MainFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Bad_Raw_MainFile/'):
                shutil.rmtree(path + 'Bad_Raw_MainFile/')
                file = open('Training_Logs/General_Log.txt', 'a+')
                self.logger.log(
                    file,
                    'Bad Raw Additional Directory deleted Sucessfully !!!')
                file.close()

        except OSError as ex:
            file = open('Training_Logs/General_Log.txt', 'a+')
            self.logger.log(
                file,
                'Error while deleting Main File Bad Raw Directory: %s' % ex)
            file.close()
            raise OSError

    def deleteExistingBadDataTrainingDir_AdditionalFile(self):
        try:
            path = "Training_Raw_Validated_File/"
            if os.path.isdir(path + 'Bad_Raw_AdditionalFile/'):
                shutil.rmtree(path + 'Bad_Raw_AdditionalFile/')
                file = open('Training_Logs/General_Log.txt', 'a+')
                self.logger.log(
                    file,
                    'Bad Raw Additional Directory deleted Sucessfully !!!')
                file.close()
        except OSError as ex:
            file = open('Training_Logs/General_Log.txt', 'a+')
            self.logger.log(
                file,
                'Error while deleting Additional Bad Raw Directory: %s' % ex)
            file.close()
            raise OSError

    def moveBadFilesToArchiveBad_MainFile(self):
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_Validated_File/Bad_Raw_MainFile/'
            if os.path.isdir(source):
                path = 'TrainingArchiveBadData_MainFile'
                if not os.path.isdir(path):
                    os.makedirs(path)
                destination = 'TrainingArchiveBadData_MainFile/Bad_Data_' + str(
                    date) + "_" + str(time)

                if not os.path.isdir(destination):
                    os.makedirs(destination)
                files = os.listdir(source)

                for f in files:
                    if f not in os.listdir(destination):
                        shutil.move(source + f, destination)
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(file, 'Bad Main files moved to archive')
            path = "Training_Raw_Validated_File"
            if os.path.isdir(path + 'Bad_Raw_MainFile/'):
                shutil.rmtree(path + 'Bad_Raw_MainFile/')
            self.logger.log(
                file,
                'Bad Raw Main Files Data Directory Removed Successfully!!')
            file.close()

        except Exception as e:
            file = open("Training_Logs/General_Log.txt", 'a+')
            self.logger.log(
                file, 'Error while moving bad main files to Archive::%s' % e)
            file.close()
            raise e

    def moveBadFilesToArchiveBad_AdditionalFile(self):
        now = datetime.now()
        date = now.date()
        time = now.strftime("%H%M%S")
        try:
            source = 'Training_Raw_Validated_File/Bad_Raw_AdditionalFile/'
            if os.path.isdir(source):
                path = 'TrainingArchiveBadData_AdditionalFile'
                if not os.path.isdir(path):
                    os.makedirs(path)
                destination = 'TrainingArchiveBadData_AdditionalFile/Bad_Data_' + str(
                    date) + "_" + str(time)

                if not os.path.isdir(destination):
                    os.makedirs(destination)
                files = os.listdir(source)

                for f in files:
                    if f not in os.listdir(destination):
                        shutil.move(source + f, destination)
            file = open("Training_Logs/General_Log.txt", 'a+')
            self.logger.log(file, 'Bad Additional files moved to archive')
            path = "Training_Raw_Validated_File"
            if os.path.isdir(path + 'Bad_Raw_AdditionalFile/'):
                shutil.rmtree(path + 'Bad_Raw_AdditionalFile/')
            self.logger.log(
                file,
                'Bad Raw Additional Files Data Directory Removed Successfully!!'
            )
            file.close()

        except Exception as e:
            file = open("Training_Logs/GeneralLog.txt", 'a+')
            self.logger.log(
                file, 'Error while moving bad main files to Archive::%s' % e)
            file.close()
            raise e

    def validationFileNameRaw_MainFile(self, mainfile_Regex,
                                       main_lengthofdatestampinfile,
                                       main_lengthoftimestampinfile):
        self.deleteExistingBadDataTrainingDir_MainFile()
        self.deleteExistingGoodDataTrainingDir_MainFile()
        self.createDirectoryFor_GoodBadRawData_MainFile()

        onlyfiles = [f for f in os.listdir(self.batch_directory_MainFile)]

        try:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if (re.match(mainfile_Regex, filename)):
                    split = re.split('.csv', filename)
                    split = re.split('_', split[0])
                    if len(split[2]) == main_lengthofdatestampinfile:
                        if len(split[3]) == main_lengthoftimestampinfile:
                            shutil.copy(
                                "Training_Batch_Files/Main_File/" + filename,
                                "Training_Raw_Validated_File/Good_Raw_MainFile"
                            )
                            self.logger.log(
                                file,
                                'Valid File Name !! File moved to GoodRaw_Main Directory ::%s'
                                % filename)
                        else:
                            shutil.copy(
                                "Training_Batch_Files/Main_File/" + filename,
                                "Training_Raw_Validated_File/Bad_Raw_MainFile")
                            self.logger.log(
                                file,
                                'Invalid File Name!! File moved to Bad Raw Main File Directory'
                            )
                    else:
                        shutil.copy(
                            "Training_Batch_Files/Main_File/" + filename,
                            "Training_Raw_Validated_File/Bad_Raw_MainFile")
                        self.logger.log(
                            file,
                            'Invalid File Name!! File moved to Bad Raw Main File Directory'
                        )
                else:
                    shutil.copy(
                        "Training_Batch_Files/Main_File/" + filename,
                        "Training_Raw_Validated_File/Bad_Raw_MainFile")
                    self.logger.log(
                        file,
                        'Invalid File Name!! File moved to Bad Raw Main File Directory'
                    )
            file.close()
        except Exception as e:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(
                file, "Error occured while validating Main FileName %s" % e)
            file.close()
            raise e

    def validationFileNameRaw_AdditionalFile(
            self, additionalfile_Regex, additionalfile_lengthofdatestampinfile,
            additionalfile_lengthoftimestampinfile):
        self.deleteExistingBadDataTrainingDir_AdditionalFile()
        self.deleteExistingGoodDataTrainingDir_AdditionalFile()
        self.createDirectoryFor_GoodBadRawData_AdditionalFile()

        onlyfiles = [
            f for f in os.listdir(self.batch_directory_AdditionalFile)
        ]

        try:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            for filename in onlyfiles:
                if (re.match(additionalfile_Regex, filename)):
                    split = re.split('.csv', filename)
                    split = re.split('_', split[0])
                    if len(split[3]) == additionalfile_lengthofdatestampinfile:
                        if len(split[4]
                               ) == additionalfile_lengthoftimestampinfile:
                            shutil.copy(
                                "Training_Batch_Files/Additional_File/" +
                                filename,
                                "Training_Raw_Validated_File/Good_Raw_AdditionalFile"
                            )
                            self.logger.log(
                                file,
                                'Valid File Name !! File moved to GoodRaw_Additional Directory ::%s'
                                % filename)
                        else:
                            shutil.copy(
                                "Training_Batch_Files/Additional_File/" +
                                filename,
                                "Training_Raw_Validated_File/Bad_Raw_AdditionalFile"
                            )
                            self.logger.log(
                                file,
                                'Invalid File Name!! File moved to Bad Raw Additional File Directory'
                            )
                    else:
                        shutil.copy(
                            "Training_Batch_Files/Additional_File/" + filename,
                            "Training_Raw_Validated_File/Bad_Raw_AdditionalFile"
                        )
                        self.logger.log(
                            file,
                            'Invalid File Name!! File moved to Bad Raw Additional File Directory'
                        )
                else:
                    shutil.copy(
                        "Training_Batch_Files/Additional_File/" + filename,
                        "Training_Raw_Validated_File/Bad_Raw_AdditionalFile")
                    self.logger.log(
                        file,
                        'Invalid File Name!! File moved to Bad Raw Additional File Directory'
                    )
            file.close()
        except Exception as e:
            file = open("Training_Logs/nameValidationLog.txt", 'a+')
            self.logger.log(
                file,
                "Error occured while validating Additional FileName %s" % e)
            file.close()
            raise e

    def validate_NoOfCol_MainFile(self, noofcol_mainfile):
        try:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            for file in os.listdir(
                    'Training_Raw_Validated_File/Good_Raw_MainFile/'):
                csv = pd.read_csv(
                    'Training_Raw_Validated_File/Good_Raw_MainFile/' + file)
                if csv.shape[1] == noofcol_mainfile:
                    pass
                else:
                    shutil.move(
                        'Training_Raw_Validated_File/Good_Raw_MainFile' + file,
                        'Training_Raw_Validated_File/Bad_Raw_MainFile')
                    self.logger.log(
                        f,
                        'Invalid Column length for the file !! File moved to bad raw main Directory :: %s'
                        % file)
                self.logger.log(
                    f, 'Main File Columns Length Validated Sucessfully')
            f.close()
        except OSError:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(
                f, 'Error Occured while moving file :: %s' % str(OSError))
            f.close()
            raise OSError

        except Exception as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occured:: %s" % e)
            f.close()
            raise e

    def validate_NoOfCol_AdditionalFile(self, noofcol_additionalfile):
        try:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            for file in os.listdir(
                    'Training_Raw_Validated_File/Good_Raw_AdditionalFile/'):
                csv = pd.read_csv(
                    'Training_Raw_Validated_File/Good_Raw_AdditionalFile/' +
                    file)
                if csv.shape[1] == noofcol_additionalfile:
                    pass
                else:
                    shutil.move(
                        'Training_Raw_Validated_File/Good_Raw_AdditionalFile' +
                        file,
                        'Training_Raw_Validated_File/Bad_Raw_AdditionalFile')
                    self.logger.log(
                        f,
                        'Invalid Column length for the file !! File moved to bad raw additional Directory :: %s'
                        % file)
                self.logger.log(
                    f, 'Additional File Columns Length Validated Sucessfully')
            f.close()
        except OSError:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(
                f, 'Error Occured while moving file :: %s' % str(OSError))
            f.close()
            raise OSError

        except Exception as e:
            f = open("Training_Logs/columnValidationLog.txt", 'a+')
            self.logger.log(f, "Error Occured:: %s" % e)
            f.close()
            raise e
 def __init__(self, mainFilePath, additionalFilePath):
     """Create the logger, open the prediction log, and optionally a validator.

     ``pred_data_val`` is only set when both path arguments are not None.
     """
     self.log_writer = App_Logger()
     prediction_log = 'Prediction_Logs/PredictionLog.txt'
     self.file_object = open(prediction_log, 'a+')
     if mainFilePath is None or additionalFilePath is None:
         return
     self.pred_data_val = Raw_Data_Validation(mainFilePath,
                                              additionalFilePath)
 def __init__(self):
     """Create the logger and open the prediction log in append mode."""
     self.log_writer = App_Logger()
     prediction_log = 'Prediction_Logs/PredictionLog.txt'
     self.file_object = open(prediction_log, 'a+')
class Prediction:
    """Runs batch prediction with the saved model and writes a CSV report."""

    def __init__(self, mainFilePath, additionalFilePath):
        self.log_writer = App_Logger()
        # Opened here and closed at the end of predict_from_model(), so
        # one instance supports a single prediction run.
        self.file_object = open('Prediction_Logs/PredictionLog.txt', 'a+')
        # pred_data_val is only set when both paths are provided.
        if mainFilePath is not None and additionalFilePath is not None:
            self.pred_data_val = Raw_Data_Validation(mainFilePath,
                                                     additionalFilePath)

    def predict_from_model(self):
        """Predict for the loaded data and append the results to a CSV file.

        Deletes the previous prediction file, loads and preprocesses the
        main data, loads the saved model, predicts, and writes the
        identifying columns plus a ``Prediction`` column to
        ``Prediction_Output_File/Prediction.csv``.

        Returns:
            A tuple ``(path, preview)`` where *path* is the output file
            path and *preview* is the first rows of the result as a JSON
            string (``orient="records"``).

        Raises:
            Exception: any error is logged with its message and re-raised.
        """
        self.log_writer.log(self.file_object, 'Start of Prediction')
        try:
            self.pred_data_val.deletePredictionFile()
            data_getter = data_loader_prediction.Data_Getter(
                self.file_object, self.log_writer)
            main_data, additional_data = data_getter.get_data()

            preprocessor = data_preprocessing.PreProcessor(
                self.file_object, self.log_writer)
            is_null_present = preprocessor.is_null_present(main_data)
            # Explicit comparison kept: the return type of is_null_present
            # is not visible from this file.
            if is_null_present == True:
                main_data = preprocessor.impute_missing_values(main_data)
            main_data = preprocessor.map_ip_to_country(main_data,
                                                       additional_data)
            main_data = preprocessor.difference_signup_and_purchase(main_data)
            main_data = preprocessor.encoding_browser(main_data)
            main_data = preprocessor.encoding_source(main_data)
            main_data = preprocessor.encoding_sex(main_data)
            main_data = preprocessor.count_frequency_encoding_country(
                main_data)
            main_data, unwanted_data = preprocessor.remove_unwanted_cols(
                main_data, return_unwanted_data=True)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            model_name = file_loader.find_correct_model_file()
            model = file_loader.load_model(model_name)
            predictions = list(model.predict(main_data))

            # Columns removed before prediction, reported beside each result.
            report_columns = [
                'user_id', 'signup_time', 'purchase_time', 'device_id',
                'source', 'browser', 'sex', 'ip_address', 'Country'
            ]
            rows = list(
                zip(*[unwanted_data[column] for column in report_columns],
                    predictions))
            result = pd.DataFrame(rows,
                                  columns=report_columns + ['Prediction'])
            path = "Prediction_Output_File/Prediction.csv"
            result.to_csv(path, header=True, mode='a+')
            self.log_writer.log(self.file_object,
                                'Successfull End of Prediction')
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error Occured while doing the Prediction !! Error :: %s' %
                str(e))
            raise
        finally:
            # Single cleanup point for both the success and failure paths.
            self.file_object.close()
        return path, result.head().to_json(orient="records")
class Train_Validation:
    """Run raw-file validation and database staging for the training data.

    Each stage is delegated to ``Raw_Data_Validation``,
    ``preprocessing_beforeDB`` or ``DBOperations``; this class only sequences
    the calls and writes progress messages to
    ``Training_Logs/Training_Main_Log.txt``.
    """

    def __init__(self, mainFilepath, additionalFilepath):
        """Create the collaborators and open the main training log.

        Args:
            mainFilepath: Forwarded to ``Raw_Data_Validation``.
            additionalFilepath: Forwarded to ``Raw_Data_Validation``.
        """
        self.raw_data = Raw_Data_Validation(mainFilepath, additionalFilepath)
        self.preproccesing_beforeDB = preprocessing_beforeDB()
        self.DbOperation = DBOperations()
        self.file_object = open('Training_Logs/Training_Main_Log.txt', 'a+')
        self.log_writer = App_Logger()

    def training_validation(self):
        """Validate the raw files, load them into the DB and export them to CSV.

        The log file opened in ``__init__`` is closed when this method
        finishes, whether it succeeds or fails, so an instance supports a
        single run.

        Raises:
            Exception: Any error raised by a stage is logged and re-raised.
        """
        log = self.log_writer.log
        try:
            log(self.file_object, 'Start of Raw Data Validation on Files !!')
            # Only some of the schema values are needed below; the two file
            # names are unpacked but not used by this method.
            (main_file, additional_file,
             mainFile_LengthofDataStampInFile,
             mainFile_LengthofTimeStampInFile,
             additional_LengthofDateStampInFile,
             additional_LengthofTimeStampInFile,
             mainFile_ColName, additionalFile_ColName,
             NoCol_MainFile, NoCol_AdditionalFile
             ) = self.raw_data.fetch_values_from_schema()
            mainFile_regex = self.raw_data.mainfile_manualRegexCreation()
            additionalFile_regex = (
                self.raw_data.additionalfile_manualRegexCreation())
            self.raw_data.validationFileNameRaw_MainFile(
                mainFile_regex, mainFile_LengthofDataStampInFile,
                mainFile_LengthofTimeStampInFile)
            self.raw_data.validationFileNameRaw_AdditionalFile(
                additionalFile_regex, additional_LengthofDateStampInFile,
                additional_LengthofTimeStampInFile)
            self.raw_data.validate_NoOfCol_MainFile(NoCol_MainFile)
            self.raw_data.validate_NoOfCol_AdditionalFile(NoCol_AdditionalFile)
            log(self.file_object, 'Raw Data Validation Completed !!')

            log(self.file_object, 'Start of Data Preprocessing before DB')
            self.preproccesing_beforeDB.replaceMissingWithNull_MainFile()
            self.preproccesing_beforeDB.replaceMissingWithNull_AdditionalFile()
            log(self.file_object, 'Data Preprocessing before DB Completed !!')

            log(self.file_object,
                'Start of Creating TrainingDatabase and Table based on given schema!!!')
            self.DbOperation.createTable_MainFile('Training', mainFile_ColName)
            self.DbOperation.createTable_AdditionalFile(
                'Training', additionalFile_ColName)
            log(self.file_object,
                'Creation of Table in Database Successfull !!!')

            log(self.file_object, "Insertion of Data into Table started!!!!")
            self.DbOperation.InsertIntoTableGoodData_MainFile('Training')
            self.DbOperation.InsertIntoTableGoodData_AdditionalFile('Training')
            log(self.file_object,
                "Insertion of Data into Tables Completed!!!!")

            log(self.file_object,
                "Deleting Main and Additional File Good Data Folder!!!")
            self.raw_data.deleteExistingGoodDataTrainingDir_MainFile()
            self.raw_data.deleteExistingGoodDataTrainingDir_AdditionalFile()
            log(self.file_object,
                'Main and Additional Good File Directory Deleted !!!')

            log(self.file_object,
                'Starting moving bad files to Archive and deleting bad data directory')
            self.raw_data.moveBadFilesToArchiveBad_MainFile()
            self.raw_data.moveBadFilesToArchiveBad_AdditionalFile()
            log(self.file_object,
                'Bad Files moved to Archive!! and Bad Directory Deleted !!')
            log(self.file_object, 'Raw Data Validation Completed Successfully')

            log(self.file_object, 'Exporting Data Into CSV File Started')
            self.DbOperation.SelectingDataFromTableIntoCSV_MainFile('Training')
            self.DbOperation.SelectingDataFromTableIntoCSV_AdditionalFile(
                'Training')
            log(self.file_object, 'Data to CSV File Exported Successfull')
            log(self.file_object, 'End of Raw Data Validation!!!')
        except Exception as e:
            # Record the failure before propagating it; previously the error
            # was re-raised without any trace in the main log.
            log(self.file_object,
                'Error Occured while doing the Training Validation !! Error :: %s'
                % str(e))
            raise
        finally:
            # Close on every path so a failed run does not leak the handle.
            self.file_object.close()
Exemplo n.º 13
0
 def __init__(self):
     """Create an App_Logger and open the model-training log for appending."""
     self.log_writer = App_Logger()
     training_log_path = 'Training_Logs/ModelTrainingLog.txt'
     self.file_object = open(training_log_path, mode='a+')
Exemplo n.º 14
0
def preprocess_and_split(config_path):
    """Clean the raw training CSV and store the processed result.

    Reads ``load_data.raw_train_data_csv`` and ``split_data.train_path`` from
    the parameters returned by ``read_params(config_path)``, runs the
    preprocessing helpers in sequence, and writes the resulting frame to the
    train path. Progress messages are appended to ``Training_log.txt``.

    Args:
        config_path: Path handed to ``read_params``.
    """
    from sklearn.impute import KNNImputer

    logger = App_Logger()
    # The context manager closes the log even when a step raises.
    with open('Training_log.txt', 'a+') as file_object:
        config = read_params(config_path)

        train_data_path = config["split_data"]["train_path"]
        raw_train_data_path = config["load_data"]["raw_train_data_csv"]
        logger.log(file_object, "Training Data load was successful")

        train_df = pd.read_csv(raw_train_data_path)
        logger.log(file_object, "Data reading successful")

        # 1. Extract features from the date column.
        train_df = date_process(train_df)
        logger.log(file_object, "Datetime Processing in train data completed ")

        # 2. Find the columns that hold JSON data.
        train_json_columns = column_validator(train_df)
        logger.log(file_object, "Column_validator successful")

        # 2.1 Flatten the JSON columns and merge them with the original frame.
        # `target` stays None when nothing was flattened, so the re-attach
        # step below can be skipped instead of failing with NameError.
        target = None
        if train_json_columns is not None:
            train_df = json_to_df(train_df, train_json_columns)
            target = train_df['transactionRevenue']
            logger.log(file_object, "Normalizing the json columns completed")

        # 3. Drop columns with more than 50% nulls and columns that do not
        #    contribute to the target variable.
        train_df = remove_nan_cols(train_df)
        logger.log(file_object, "50% NAN value columns are removed")
        # sessionId is the combination of fullVisitorId and visitId.
        train_df.drop('sessionId', axis=1, inplace=True)
        # visitStartTime has already been extracted into visitHour.
        train_df.drop('visitStartTime', axis=1, inplace=True)
        # fullVisitorId is very long and contributes little to the target.
        train_df.drop('fullVisitorId', axis=1, inplace=True)
        drop_columns = ['visitId', 'weekday', 'day']
        train_df.drop(drop_columns, axis=1, inplace=True)
        logger.log(
            file_object,
            'Dropped columns which are not contributing to the transaction revenue'
        )

        # 4. Impute null values. transactionRevenue is re-attached first so
        #    its NaNs are imputed with 0 along with the other columns.
        if target is not None:
            train_df = pd.concat([train_df, target], axis=1)
        train_df = impute_na(train_df)
        logger.log(file_object, "Imputing NAN values with 0 is completed")

        # 5. Convert object dtypes to the desired ones.
        train_df = data_type_convert(train_df)
        logger.log(file_object, "Conversion of Datatype to int completed")

        # 6. Remove constant (zero standard deviation) columns.
        train_df = remove_zero_std_cols(train_df)
        logger.log(file_object, "Zero standard deviation columns are removed")

        # 7. Collect the categorical columns and label-encode them.
        label_cols = categorical_cols(train_df)
        logger.log(file_object,
                   "Gathering of label _cols in train data completed ")

        train_df = label_encoding(train_df, label_cols)
        logger.log(file_object, "Label_encoding in train data completed ")

        # 8. Impute the pageviews column with KNNImputer.
        imputer = KNNImputer()
        train_df['pageviews'] = imputer.fit_transform(train_df[['pageviews']])
        logger.log(file_object, "Pageviews column imputed with KNNimputer")

        # Store the processed training data.
        train_df.to_csv(train_data_path, sep=",", index=False,
                        encoding="utf-8")
        logger.log(
            file_object,
            "Training data is processed and stored as data/processed/train_processed.csv"
        )
Exemplo n.º 15
0
class DBOperations:
    def __init__(self):
        """Set the database and validated-file directories and create the logger."""
        validated_root = "Prediction_Raw_Validated_File"
        self.path = "Prediction_Database/"
        # Good/bad folders for the main file.
        self.goodRaw_MainFile_path = validated_root + "/Good_Raw_MainFile"
        self.badRaw_MainFile_path = validated_root + "/Bad_Raw_MainFile"
        # Good/bad folders for the additional file.
        self.goodRaw_AdditionalFile_path = validated_root + "/Good_Raw_AdditionalFile"
        self.badRaw_AdditionalFile_path = validated_root + "/Bad_Raw_AdditionalFile"
        self.logger = App_Logger()

    def DatabaseConnection(self,database_name):
        try:
            con = sqlite3.connect(self.path + database_name + '.db')
            file = open('Prediction_Logs/DataBaseConnection.txt','a+')
            self.logger.log(file,'Database Connection to %s Successfully' % database_name + '.db')
            file.close()
        except ConnectionError:
            file = open('Prediction_Logs/DataBaseConnection.txt','a+')
            self.logger.log(file,'Error while connecting to database:: %s' % str(ConnectionError))
            file.close()
            raise ConnectionError
        return con

    def createTable_MainFile(self,database,colname_MainFile):
        try:
            con = self.DatabaseConnection(database)

            c = con.cursor()
            c.execute("SELECT count(name)  FROM sqlite_master WHERE type = 'table' AND name = 'MainFile_Good_Raw_Data'")
            if c.fetchone()[0] == 1:
                con.close()
                file = open('Prediction_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file,'MainFile_Good_Raw_Data Table Created Successfully !!')
                file.close()
            else:
                for key in colname_MainFile.keys():
                    type = colname_MainFile[key]
                    try:
                        con.execute('ALTER TABLE MainFile_Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'.format(column_name=key,dataType=type))
                    except:
                        con.execute('CREATE TABLE MainFile_Good_Raw_Data ({column_name} {dataType})'.format(column_name=key, dataType=type))
                con.close()

        except Exception as e:
            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file,'Error while creating Table:: %s' % e)
            file.close()
            con.close()
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file, "Closed %s database successfully" % database)
            file.close()
            raise e

    def createTable_AdditionalFile(self,database,colname_AdditionalFile):
        try:
            con = self.DatabaseConnection(database)
            c = con.cursor()
            c.execute("SELECT count(name)  FROM sqlite_master WHERE type = 'table' AND name = 'AdditionalFile_Good_Raw_Data'")
            if c.fetchone()[0] == 1:
                con.close()
                file = open('Prediction_Logs/DbTableCreateLog.txt', 'a+')
                self.logger.log(file,'AdditionalFile_Good_Raw_Data Table Created Successfully !!')
                file.close()
            else:
                for key in colname_AdditionalFile.keys():
                    type = colname_AdditionalFile[key]
                    try:
                        con.execute('ALTER TABLE AdditionalFile_Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'.format(column_name=key,dataType=type))
                    except:
                        con.execute('CREATE TABLE AdditionalFile_Good_Raw_Data ({column_name} {dataType})'.format(column_name=key, dataType=type))
                con.close()

        except Exception as e:
            file = open("Prediction_Logs/DbTableCreateLog.txt", 'a+')
            self.logger.log(file,'Error while creating Table:: %s' % e)
            file.close()
            con.close()
            file = open("Prediction_Logs/DataBaseConnectionLog.txt", 'a+')
            self.logger.log(file, "Closed %s database successfully" % database)
            file.close()
            raise e

    def InsertIntoTableGoodData_MainFile(self,database):
        con = self.DatabaseConnection(database)
        c = con.cursor()
        MainFile_goodDataPath = self.goodRaw_MainFile_path
        only_files = [f for f in os.listdir(MainFile_goodDataPath)]
        log_file = open("Prediction_Logs/DbInsertLog.txt", 'a+')
        for file in only_files:
            try:
                with open(MainFile_goodDataPath + '/' + file, 'r') as f:
                    #next(f)
                    dr = csv.DictReader(f)
                    to_dict = [(int(i['user_id']),i['signup_time'],i['purchase_time'],int(i['purchase_value']),i['device_id'],i['source'],i['browser'],i['sex'],int(i['age']),float(i['ip_address'])) for i in dr]
                try:
                    insert = """INSERT INTO MainFile_Good_Raw_Data VALUES (?,?,?,?,?,?,?,?,?,?);"""
                    c.executemany(insert,to_dict)
                    con.commit()
                    self.logger.log(log_file, "%s File Loaded Successfully in MainFile_Good_Raw_Data Table" % file)
                except Exception as e:
                    self.logger.log(log_file,'Error while Inserting into MainFile_Good_Raw_Data Table %s' % str(e))
            except Exception as e:
                    con.rollback()
                    self.logger.log(log_file,'Error while Inserting into MainFile_Good_Raw_Data %s' % str(e))
                    shutil.move(self.goodRaw_MainFile_path + '/' + file,self.badRaw_MainFile_path)
                    self.logger.log(log_file,'Main File Moved Successfully after Error in Insertion into Database')
                    log_file.close()
                    con.close()
        con.close()
        log_file.close()

    def InsertIntoTableGoodData_AdditionalFile(self,database):
        con = self.DatabaseConnection(database)
        c = con.cursor()
        AdditionalFile_goodDataPath = self.goodRaw_AdditionalFile_path
        only_files = [f for f in os.listdir(AdditionalFile_goodDataPath)]
        log_file = open("Prediction_Logs/DbInsertLog.txt", 'a+')
        for file in only_files:
            try:
                with open(AdditionalFile_goodDataPath + '/' + file, 'r') as f:
                    # next(f)
                    dr = csv.DictReader(f)
                    to_dict = [(float(i['lower_bound_ip_address']),float(i['upper_bound_ip_address']),i['country']) for i in dr]
                try:
                    insert = """INSERT INTO AdditionalFile_Good_Raw_Data VALUES (?,?,?);"""
                    c.executemany(insert, to_dict)
                    con.commit()
                    self.logger.log(log_file, "%s File Loaded Successfully in AdditionalFile_Good_Raw_Data Table" % file)
                except Exception as e:
                    self.logger.log(log_file, 'Error while Inserting into MainFile_Good_Raw_Data Table %s' % str(e))
            except Exception as e:
                con.rollback()
                self.logger.log(log_file, 'Error while Inserting into AdditionalFile_Good_Raw_Data %s' % str(e))
                shutil.move(self.goodRaw_AdditionalFile_path + '/' + file, self.badRaw_AdditionalFile_path)
                self.logger.log(log_file, 'Additional File Moved Successfully after Error in Insertion into Database')
                log_file.close()
                con.close()
        con.close()
        log_file.close()


    def SelectingDataFromTableIntoCSV_MainFile(self,database):
        self.TrainingfileFromDB_Dir ='PredictionFileFromDB'
        self.MainFilePath = 'MainFile'
        self.MainFile_Name = 'InputFile.csv'
        log_file = open("Prediction_Logs/ExportToCsv.txt", 'a+')
        try:
            con = self.DatabaseConnection(database)
            sql_select = 'SELECT * FROM MainFile_Good_Raw_Data'
            cursor = con.cursor()

            cursor.execute(sql_select)
            results = cursor.fetchall()
            header =[i[0] for i in cursor.description]

            if not os.path.isdir(self.TrainingfileFromDB_Dir + '/' + self.MainFilePath):
                os.makedirs(os.path.join(self.TrainingfileFromDB_Dir,self.MainFilePath))

            csvFile = csv.writer(open(self.TrainingfileFromDB_Dir + '/' + self.MainFilePath + '/' + self.MainFile_Name,'w',newline=''),delimiter=',',
                                 lineterminator='\r\n',quoting=csv.QUOTE_ALL,escapechar='\\')
            csvFile.writerow(header)
            csvFile.writerows(results)
            self.logger.log(log_file,'MainFile Exported as .csv Format Successfully')
            log_file.close()
        except Exception as e:
            self.logger.log(log_file,'MainFile Exporting Failed:: %s' % e)
            log_file.close()

    def SelectingDataFromTableIntoCSV_AdditionalFile(self,database):
        self.TrainingfileFromDB_Dir ='PredictionFileFromDB'
        self.AdditionalFilePath = 'AdditionalFile'
        self.AdditionalFile_Name = 'AdditionalFile.csv'
        log_file = open("Prediction_Logs/ExportToCsv.txt", 'a+')
        try:
            con = self.DatabaseConnection(database)
            sql_select = 'SELECT * FROM AdditionalFile_Good_Raw_Data'
            cursor = con.cursor()

            cursor.execute(sql_select)
            results = cursor.fetchall()
            header =[i[0] for i in cursor.description]

            if not os.path.isdir(self.TrainingfileFromDB_Dir + '/' + self.AdditionalFilePath):
                os.makedirs(os.path.join(self.TrainingfileFromDB_Dir,self.AdditionalFilePath))

            csvFile = csv.writer(open(self.TrainingfileFromDB_Dir + '/' + self.AdditionalFilePath + '/' + self.AdditionalFile_Name, 'w', newline=''),
                delimiter=',',lineterminator='\r\n', quoting=csv.QUOTE_ALL, escapechar='\\')
            csvFile.writerow(header)
            csvFile.writerows(results)
            self.logger.log(log_file,'AdditionalFile Exported as .csv Format Successfully')
            log_file.close()
        except Exception as e:
            self.logger.log(log_file,'AdditionalFile Exporting Failed:: %s' % e)
            log_file.close()