class dataTransform:
    """Transform the validated "good" wafer training data held in MongoDB.

    Written By: Rajat Bisoi
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.logger = App_Logger('wafer')
        self.mongo = To_mongo_db()

    def replaceMissingWithNull(self):
        """Fill missing values with 'NULL' and trim the 'Wafer' column.

        For every id returned by ``Get_ID`` for the 'wafer_good_data'
        collection of 'temp_db', the matching data is fetched, missing values
        are replaced by the string 'NULL', the first six characters of each
        'Wafer' value are dropped, the result is sent back to the collection
        and the object identified by the original id is deleted.

        Raises:
            Exception: any error raised while processing is logged to
                'wafer_log' and then re-raised.
        """
        collection = 'wafer_good_data'
        database = 'temp_db'
        try:
            for doc_id in self.mongo.Get_ID(collection, database):
                frame = self.mongo.downlaod_one_from_mongo(
                    collection, database, doc_id, initial_columnname='Wafer')
                frame.fillna('NULL', inplace=True)
                # Drop the first six characters of every 'Wafer' value.
                frame['Wafer'] = frame['Wafer'].str[6:]
                self.mongo.send_to_mongo(
                    collection, database, frame, initial_columnname='Wafer')
                self.mongo.Delete_obj_in_collection(collection, database, doc_id)
                success_message = (str(doc_id).replace("'", "-")
                                   + " File Transformed successfully!!")
                self.logger.log('wafer_log', success_message)
        except Exception as err:
            failure_message = ("Data Transformation failed because :"
                               + str(err).replace("'", "-"))
            self.logger.log('wafer_log', failure_message)
            raise err
class dataTransformPredict:
    """Transform the validated "good" prediction files kept in AWS storage."""

    def __init__(self):
        self.goodDataPath = "Prediction_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()

    def addQuotesToStringValuesInColumn(self):
        """Wrap '?' values of the 'stalk-root' column in single quotes.

        Every file listed under ``self.goodDataPath`` is loaded as a
        dataframe, exact '?' values in 'stalk-root' become "'?'", and the
        dataframe is saved back under the same name.

        Raises:
            Exception: any error is logged to 'dataTransformLog' and then
                re-raised.
        """
        log_name = 'dataTransformLog'
        try:
            for file_name in self.awsObj.listDirFiles(self.goodDataPath):
                frame = self.awsObj.csvToDataframe(self.goodDataPath, file_name)
                quoted = frame['stalk-root'].replace('?', "'?'")
                frame['stalk-root'] = quoted
                self.awsObj.saveDataframeToCsv(self.goodDataPath, file_name, frame)
                self.logger.log(log_name,
                                " %s: Quotes added successfully!!" % file_name)
        except Exception as err:
            self.logger.log(log_name,
                            "Data Transformation failed because:: %s" % err)
            raise err
class dataTransform:
    """Transform the validated "good" raw training files before database load."""

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Re-save every CSV in ``self.goodDataPath`` through pandas.

        NOTE(review): despite its name this variant performs no NULL
        substitution; each file is only read and written back without the
        index. Confirm whether the missing-value step was removed on purpose.

        Errors are logged to "Training_Logs/dataTransformLog.txt" and are
        not re-raised.

        Returns:
            None
        """
        # The context manager closes the log exactly once on every exit path.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load."""

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Fill missing values with 'NULL' and trim the 'Wafer' column.

        Each CSV in ``self.goodDataPath`` is read, missing values are
        replaced by the string 'NULL', the first six characters of every
        'Wafer' value are dropped, and the file is overwritten without the
        index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # Open the log before the try block so the handler can always use it;
        # previously a failed open() surfaced as an UnboundLocalError.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in os.listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    frame = pandas.read_csv(path)
                    frame.fillna('NULL', inplace=True)
                    # Drop the first six characters of every 'Wafer' value.
                    frame['Wafer'] = frame['Wafer'].str[6:]
                    frame.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransform:
    """Transform the validated "good" training files kept in AWS storage."""

    def __init__(self):
        self.goodDataPath = "Training_Good_Raw_Files_Validated"
        self.logger = App_Logger()
        self.awsObj = AwsStorageManagement()

    def addQuotesToStringValuesInColumn(self):
        """Wrap every exact '?' value in single quotes, column by column.

        Each file listed under ``self.goodDataPath`` is loaded as a
        dataframe; any column containing at least one '?' has those values
        replaced by "'?'"; the dataframe is then saved back under the same
        name. Errors are logged and not re-raised.
        """
        log_name = 'addQuotesToStringValuesInColumn'
        try:
            file_names = self.awsObj.listDirFiles(self.goodDataPath)
            for file_name in file_names:
                frame = self.awsObj.csvToDataframe(self.goodDataPath, file_name)
                for column in frame.columns:
                    series = frame[column]
                    matches = series[series == '?'].count()
                    if matches == 0:
                        continue
                    frame[column] = frame[column].replace('?', "'?'")
                self.awsObj.saveDataframeToCsv(self.goodDataPath, file_name, frame)
                self.logger.log(log_name,
                                " %s: Quotes added successfully!!" % file_name)
        except Exception as err:
            self.logger.log(log_name,
                            "Data Transformation failed because:: %s" % err)
class dataTransform:
    """Transform the validated "good" raw training files before database load."""

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Replace missing values with the string "NULL" in every good CSV.

        Each CSV in ``self.goodDataPath`` is read, missing values are
        filled with "NULL", and the file is overwritten without the index.
        Errors are logged to "Training_Logs/dataTransformLog.txt" and are
        not re-raised.

        :return: None
        """
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    data.fillna("NULL", inplace=True)
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        "%s: Quotes added successfully !!!" % file_name)
            except Exception as e:
                # Report the exception itself. The old handler formatted the
                # loop variable, which hid the cause and raised NameError when
                # the failure happened before the first iteration.
                self.logger.log(
                    log_file, "Data Transform failed because:: %s" % e)
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    # Columns removed from every file before it is written back.
    _DROPPED_COLUMNS = (
        'url', "address", "name", 'dish_liked', 'phone', 'reviews_list',
    )
    # Columns whose values are wrapped in single quotes.
    _QUOTED_COLUMNS = (
        "online_order", "book_table", "rate", "location", "rest_type",
        "cuisines", "menu_item", "listed_in(type)", "listed_in(city)",
    )

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Drop unused columns and single-quote the string columns.

        Each CSV in ``self.goodDataPath`` has ``_DROPPED_COLUMNS`` removed,
        every value of ``_QUOTED_COLUMNS`` converted to ``'<value>'``, and is
        then overwritten without the index.

        Raises:
            Exception: any processing error (for example a missing column)
                is logged to "Prediction_Logs/dataTransformLog.txt" and
                re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    data.drop(columns=list(self._DROPPED_COLUMNS), inplace=True)
                    for col in self._QUOTED_COLUMNS:
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransform:
    """Transform the validated "good" raw training files before database load.

    Version: 1.0
    Revisions: None
    """

    # Columns holding string data; their values are wrapped in single quotes.
    _STRING_COLUMNS = (
        "policy_bind_date", "policy_state", "policy_csl", "insured_sex",
        "insured_education_level", "insured_occupation", "insured_hobbies",
        "insured_relationship", "incident_state", "incident_date",
        "incident_type", "collision_type", "incident_severity",
        "authorities_contacted", "incident_city", "incident_location",
        "property_damage", "police_report_available", "auto_make",
        "auto_model", "fraud_reported",
    )

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Single-quote the values of every string column in each good CSV.

        The method name is historical: no NULL substitution happens here.
        Each CSV in ``self.goodDataPath`` has every value of
        ``_STRING_COLUMNS`` converted to ``'<value>'`` and is overwritten
        without the index. Errors are logged to
        "Training_Logs/dataTransformLog.txt" and are not re-raised.

        Returns:
            None
        """
        # The context manager closes the log exactly once on every exit path.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for col in self._STRING_COLUMNS:
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Version: 1.0
    Revisions: None
    """

    # Columns holding string data; a frozenset gives O(1) membership checks.
    _STRING_COLUMNS = frozenset((
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'TBG', 'referral_source', 'Class',
    ))

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Quote string columns and '?' placeholders in every good CSV.

        For each CSV in ``self.goodDataPath``: values of columns listed in
        ``_STRING_COLUMNS`` become ``'<value>'``; in every other column only
        exact '?' values become "'?'". The file is overwritten without the
        index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                        else:
                            data[col] = data[col].replace('?', "'?'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransform:
    """Transform the validated "good" raw training files before database load.

    Written By: Chethan D
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Single-quote the two identifier columns of every good CSV.

        Each CSV in ``self.goodDataPath`` has every value of
        'Item_Identifier' and 'Outlet_Identifier' converted to ``'<value>'``
        and is overwritten without the index. Errors are logged to
        "Training_Logs/addQuotesToStringValuesInColumn.txt" and are not
        re-raised.

        Returns:
            None
        """
        log_path = "Training_Logs/addQuotesToStringValuesInColumn.txt"
        # The context manager closes the log exactly once on every exit path.
        with open(log_path, 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pd.read_csv(path)
                    for col in ('Item_Identifier', 'Outlet_Identifier'):
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransform:
    """Transform the validated "good" raw training files before database load."""

    # Columns holding string data; a frozenset gives O(1) membership checks.
    _STRING_COLUMNS = frozenset((
        'sex', 'on_thyroxine', 'query_on_thyroxine',
        'on_antithyroid_medication', 'sick', 'pregnant', 'thyroid_surgery',
        'I131_treatment', 'query_hypothyroid', 'query_hyperthyroid',
        'lithium', 'goitre', 'tumor', 'hypopituitary', 'psych',
        'TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
        'FTI_measured', 'TBG_measured', 'TBG', 'referral_source', 'Class',
    ))

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Quote string columns and '?' placeholders in every good CSV.

        For each CSV in ``self.goodDataPath``: values of columns listed in
        ``_STRING_COLUMNS`` become ``'<value>'``; in every other column only
        exact '?' values become "'?'". The file is overwritten without the
        index. Errors are logged to
        "Training_Logs/addQuotesToStringValuesInColumn.txt" and are not
        re-raised.

        Returns:
            None
        """
        log_path = "Training_Logs/addQuotesToStringValuesInColumn.txt"
        # The context manager closes the log exactly once on every exit path.
        with open(log_path, 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pd.read_csv(path)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                        else:
                            data[col] = data[col].replace('?', "'?'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransform:
    """Transform the validated "good" raw training files before database load.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    # Columns removed from every file before it is written back.
    _DROPPED_COLUMNS = (
        'url', "address", "name", 'dish_liked', 'phone', 'reviews_list',
    )
    # Columns whose values are wrapped in single quotes.
    _QUOTED_COLUMNS = (
        "online_order", "book_table", "rate", "location", "rest_type",
        "cuisines", "menu_item", "listed_in(type)", "listed_in(city)",
    )

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Drop unused columns and single-quote the string columns.

        Each CSV in ``self.goodDataPath`` has ``_DROPPED_COLUMNS`` removed,
        every value of ``_QUOTED_COLUMNS`` converted to ``'<value>'``, and is
        then overwritten without the index. Errors are logged to
        "Training_Logs/addQuotesToStringValuesInColumn.txt" and are not
        re-raised.

        Returns:
            None
        """
        log_path = "Training_Logs/addQuotesToStringValuesInColumn.txt"
        # The context manager closes the log exactly once on every exit path.
        with open(log_path, 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pd.read_csv(path)
                    data.drop(columns=list(self._DROPPED_COLUMNS), inplace=True)
                    for col in self._QUOTED_COLUMNS:
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransform:
    """Transform the validated "good" raw training files before database load.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    # Columns holding string data; their values are wrapped in single quotes.
    _STRING_COLUMNS = (
        'Income', 'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'race', 'sex', 'native-country',
    )

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Single-quote the values of every string column in each good CSV.

        The method name is historical: no NULL substitution happens here.
        Each CSV in ``self.goodDataPath`` has every value of
        ``_STRING_COLUMNS`` converted to ``'<value>'`` and is overwritten
        without the index. Errors are logged to
        "Training_Logs/dataTransformLog.txt" and are not re-raised.

        Returns:
            None
        """
        # The context manager closes the log exactly once on every exit path.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for col in self._STRING_COLUMNS:
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Written By: Ajinkya Abhang
    Version: 1.0
    Revisions: None
    """

    # Columns treated as strings; a frozenset gives O(1) membership checks.
    _STRING_COLUMNS = frozenset((
        'laundry_options', 'parking_options', 'lat', 'long', 'state',
        'image_url', 'type', 'url', 'region', 'region_url',
    ))

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Single-quote the values of the string columns present in each CSV.

        For each CSV in ``self.goodDataPath``, every column that appears in
        ``_STRING_COLUMNS`` has its values converted to ``'<value>'``; other
        columns are left untouched. The file is overwritten without the
        index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Wrap every exact 'na' value in single quotes, column by column.

        For each CSV in ``self.goodDataPath``, any column containing at
        least one 'na' value has those values replaced by "'na'". The file
        is overwritten without the index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for column in data.columns:
                        # Only rewrite columns that actually hold 'na'.
                        if (data[column] == 'na').any():
                            data[column] = data[column].replace('na', "'na'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransform:
    """Transform the validated "good" raw training files before database load.

    Written By: Ajinkya Abhang
    Version: 1.0
    Revisions: None
    """

    # Columns treated as strings; a frozenset gives O(1) membership checks.
    _STRING_COLUMNS = frozenset((
        'laundry_options', 'parking_options', 'lat', 'long', 'state',
        'image_url', 'type', 'url', 'region', 'region_url',
    ))

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Single-quote the values of the string columns present in each CSV.

        For each CSV in ``self.goodDataPath``, every column that appears in
        ``_STRING_COLUMNS`` has its values converted to ``'<value>'``; other
        columns are left untouched. The file is overwritten without the
        index. Errors are logged to
        "Training_Logs/addQuotesToStringValuesInColumn.txt" and are not
        re-raised.

        Returns:
            None
        """
        log_path = "Training_Logs/addQuotesToStringValuesInColumn.txt"
        # The context manager closes the log exactly once on every exit path.
        with open(log_path, 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pd.read_csv(path)
                    for col in data.columns:
                        if col in self._STRING_COLUMNS:
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                # Best effort: the failure is recorded, not propagated.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Written By: Krishna Nanda
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Fill missing values with 'NULL' and trim the 'Wafer' column.

        Each CSV in ``self.goodDataPath`` is read, missing values are
        replaced by the string 'NULL', the first six characters of every
        'Wafer' value are dropped, and the file is overwritten without the
        index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # Open the log before the try block so the handler can always use it;
        # previously a failed open() surfaced as an UnboundLocalError.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    frame = pandas.read_csv(path)
                    frame.fillna('NULL', inplace=True)
                    # Drop the first six characters of every 'Wafer' value.
                    frame['Wafer'] = frame['Wafer'].str[6:]
                    frame.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class preprocessing_beforeDB:
    """Pre-process the validated "good" raw training files before database load."""

    def __init__(self):
        self.goodData_path = "Training_Raw_Validated_File/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """Replace missing values with the string 'NULL' in every good CSV.

        Entry and successful exit are recorded in
        'Training_Logs/General_Log.txt'; per-file progress and any failure
        are recorded in "Training_Logs/data_preprocessing_beforeDB.txt".
        Errors raised during processing are logged and not re-raised.

        Returns:
            None
        """
        general_log_path = 'Training_Logs/General_Log.txt'
        step_log_path = "Training_Logs/data_preprocessing_beforeDB.txt"

        with open(general_log_path, 'a+') as general_log:
            self.logger.log(general_log, 'Entered replaceMissingWithNull() method of preprocessing_beforeDB class of training_data_preprocessing_beforeDB package')

        try:
            # The with-block closes the step log even when a file fails to
            # load; the handle used to leak on that path.
            with open(step_log_path, "a+") as step_log:
                for file_name in os.listdir(self.goodData_path):
                    path = self.goodData_path + "/" + file_name
                    csv = pd.read_csv(path)
                    csv.fillna('NULL', inplace=True)
                    csv.to_csv(path, index=None, header=True)
                    self.logger.log(step_log, 'Replace Missing values with Null Values in Good Raw Main File Successfully !!')

            with open(general_log_path, 'a+') as general_log:
                self.logger.log(general_log, 'Successfully Executed replaceMissingWithNull() method of preprocessing_beforeDB class of training_data_preprocessing_beforeDB package')
        except Exception as e:
            # Best effort: the failure is recorded, not propagated.
            with open(step_log_path, "a+") as step_log:
                self.logger.log(step_log, 'Replace missing with Null Values failed in Main File becasue:: %s' % str(e))
class dataTransform:
    """Transform the training and new prediction data before database load."""

    def __init__(self):
        # NOTE(review): this handle is opened once per instance and nothing
        # in this class closes it.
        self.file_object = open("../logs/datatransform/log.txt", 'a+')
        self.logger = App_Logger()

    def trainingData(self):
        """Ordinal-encode 'Geography' and 'Gender' and return the training frame.

        The data returned by ``readWriteOps.Data_Getter().get_data()`` is
        reduced to the columns from position 3 onwards. 'Geography' and
        'Gender' are encoded with an ``OrdinalEncoder`` (int32 output) and
        placed first, followed by the remaining columns. The fitted encoder
        is pickled to 'encoder.pkl' in the current working directory.

        Returns:
            The transformed dataframe, or ``None`` when any step fails (the
            failure is logged and not re-raised).
        """
        self.logger.log(self.file_object, 'Entered the trainingData method of the dataTransform class')
        try:
            data_getter = readWriteOps.Data_Getter()
            data = data_getter.get_data()
            df_filter = data.iloc[:, 3:]

            oe = OrdinalEncoder(dtype=np.int32)
            encoded = oe.fit_transform(df_filter[['Geography', 'Gender']])
            # Reuse the source index so concat lines the rows up even when
            # the incoming frame is not indexed 0..n-1.
            df_encoded = pd.DataFrame(data=encoded,
                                      columns=['Geography', 'Gender'],
                                      index=df_filter.index)
            df_rest = df_filter.drop(['Geography', 'Gender'], axis=1)
            df = pd.concat([df_encoded, df_rest], axis=1)

            # The context manager closes the file even if pickling fails.
            with open('encoder.pkl', 'wb') as output:
                pickle.dump(oe, output)

            self.logger.log(self.file_object, 'Data transfomr Successful.Exited trainingData method of the dataTransform class')
            return df
        except Exception as e:
            self.logger.log(self.file_object, 'Exception occured in trainingData method of the dataTransform class. Exception message: ' + str(e))
            self.logger.log(self.file_object, 'ataTransform Unsuccessful.Exited the trainingData method of the dataTransform class')
            return None
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load."""

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Single-quote the 'Id' and 'ActivityDate' columns of every good CSV.

        Each CSV in ``self.goodDataPath`` has every value of both columns
        converted to ``'<value>'`` and is overwritten without the index.

        Raises:
            Exception: any processing error (for example a missing column)
                is logged to "Prediction_Logs/dataTransformLog.txt" and
                re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    for col in ('Id', 'ActivityDate'):
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransformPredict:
    """Transform the validated "good" raw prediction files before database load.

    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """Remove the 'Unnamed: 0' column from every good CSV, if present.

        The method name is historical: no quoting happens here. Each CSV in
        ``self.goodDataPath`` is read, the 'Unnamed: 0' column is dropped
        when it exists, and the file is overwritten without the index.

        Raises:
            Exception: any processing error is logged to
                "Prediction_Logs/dataTransformLog.txt" and re-raised.
        """
        # One handle for both outcomes; the old handler opened a second handle
        # and never closed the first.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file_name in listdir(self.goodDataPath):
                    path = self.goodDataPath + "/" + file_name
                    data = pandas.read_csv(path)
                    if 'Unnamed: 0' in data.columns:
                        data.drop('Unnamed: 0', axis=1, inplace=True)
                    data.to_csv(path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: Quotes added successfully!!" % file_name)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class dataTransformPredict:
    """
    This class shall be used for transforming the Good Raw Prediction Data
    before loading it in Database!!.

    Written By: iNeuron Intelligence
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: For every CSV file in ``self.goodDataPath``, encloses
                     each value of the ``class`` column in single quotes and
                     writes the file back in place. This is done to avoid the
                     error while inserting string values in table as varchar.
        Output: None
        On Failure: The error is written to
                    Prediction_Logs/addQuotesToStringValuesInColumn.txt and is
                    NOT re-raised; files after the failing one are left
                    untouched.

        Written By: iNeuron Intelligence
        Version: 1.0
        Revisions: None
        """
        # The context manager closes the log even if logging itself fails.
        with open("Prediction_Logs/addQuotesToStringValuesInColumn.txt",
                  'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file
                    data = pd.read_csv(file_path)
                    data['class'] = data['class'].apply(
                        lambda x: "'" + str(x) + "'")
                    data.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file, " %s: Quotes added successfully!!" % file)
            except Exception as e:
                # Logged only: this variant reports failures without raising.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data
    before loading it in Database!!.
    """

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    @staticmethod
    def _is_string_column(series):
        """
        Return True when the column holds text rather than numbers.

        A column counts as numeric when its dtype is numeric, or when every
        value other than the '?' placeholder (and missing values) parses as a
        number; anything else is treated as a string column.
        """
        if pd.api.types.is_numeric_dtype(series):
            return False
        known = series[series != '?'].dropna()
        return bool(pd.to_numeric(known, errors='coerce').isna().any())

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: For every CSV file in ``self.goodDataPath``, encloses
                     each value of a string column in single quotes, and in
                     integer/float columns quotes only the '?' placeholder.
                     This is done to avoid the error while inserting string
                     values in table as varchar.
        Output: None
        On Failure: The error is written to
                    Training_Logs/addQuotesToStringValuesInColumn.txt and is
                    NOT re-raised.
        """
        with open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                  'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file
                    data = pd.read_csv(file_path)
                    # NOTE(review): no explicit list of string columns is
                    # available here, so columns are classified from their
                    # content - confirm this matches the table schema.
                    for col in data.columns:
                        if self._is_string_column(data[col]):
                            # add quotes in string value
                            data[col] = data[col].apply(
                                lambda x: "'" + str(x) + "'")
                        else:
                            # add quotes to '?' values in integer/float columns
                            data[col] = data[col].replace('?', "'?'")
                    data.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file, " %s: Quotes added successfully!!" % file)
            except Exception as e:
                # Logged only: this variant reports failures without raising.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransform:
    """
    Transforms the validated "good" raw training files in place before they
    are loaded into the database.
    """

    def __init__(self):
        # Previously spelled ``__int__``, so these attributes were never set
        # when an instance was created.
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def addQuotesToStringValuesInColumn(self):
        """
        Method Name: addQuotesToStringValuesInColumn
        Description: For every CSV file in ``self.goodDataPath``, encloses
                     each value of the ``DATE`` column in single quotes and
                     writes the file back in place.
        Output: None
        On Failure: The error is written to
                    Training_Logs/addQuotesToStringValuesInColumn.txt and is
                    NOT re-raised.
        """
        with open("Training_Logs/addQuotesToStringValuesInColumn.txt",
                  'a+') as log_file:
            try:
                for file in os.listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file
                    data = pd.read_csv(file_path)
                    data['DATE'] = data["DATE"].apply(
                        lambda x: "'" + str(x) + "'")
                    data.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file, " %s: Quotes added successfully!!" % file)
            except Exception as e:
                # Logged only: this variant reports failures without raising.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
def train_and_evaluate(config_path):
    """
    Train the XGBoost model on the processed dataset and record the run in
    MLflow.

    The processed training CSV is split by date (rows up to and including
    2017-05-31 are used for training, later rows form the validation slice),
    the ``transactionRevenue`` target is log1p-transformed, a model is fitted
    with ``run_xgb`` and its training-set metrics are logged to the MLflow
    experiment named in the configuration.

    Args:
        config_path: Path handed to ``read_params``; the resulting mapping
            must provide ``split_data.train_path`` and ``mlflow_config``
            (``remote_server_uri``, ``experiment_name``, ``run_name``,
            ``registered_model_name``).

    Returns:
        None. Progress messages are appended to ``Training_log.txt``.
    """
    config = read_params(config_path)
    train_data_path = config["split_data"]["train_path"]
    split_date = datetime.date(2017, 5, 31)

    # The context manager closes the log even when training or MLflow fails.
    with open('Training_log.txt', 'a+') as file_object:
        logger = App_Logger()

        df = pd.read_csv(train_data_path)  # Reading the processed dataset
        df["date"] = pd.to_datetime(df["date"]).dt.date
        X_train = df[df['date'] <= split_date]  # training data
        val_X = df[df['date'] > split_date]  # validation data
        logger.log(file_object, "Splitting dataset completed")

        X_train = X_train.drop(['date'], axis=1)
        val_X = val_X.drop(['date'], axis=1)
        y_train = np.log1p((X_train["transactionRevenue"]).values)
        val_y = np.log1p((val_X["transactionRevenue"]).values)
        logger.log(
            file_object,
            "Log transformation of transaction Revenue values completed")

        x1 = X_train.drop(['transactionRevenue'], axis=1)
        y_train = pd.DataFrame(y_train)
        # The validation features/target are prepared here but are not
        # consumed further down in this function.
        val_x1 = val_X.drop(['transactionRevenue'], axis=1)
        val_y = pd.DataFrame(val_y)

        ################## MLFLOW ######################
        mlflow_config = config["mlflow_config"]
        remote_server_uri = mlflow_config['remote_server_uri']
        mlflow.set_tracking_uri(remote_server_uri)
        mlflow.set_experiment(mlflow_config["experiment_name"])

        with mlflow.start_run(run_name=mlflow_config["run_name"]):
            model_xgb = run_xgb(x1, y_train)
            y_train_predict = model_xgb.predict(x1)
            rmse, mae, r2 = eval_metrics(y_train, y_train_predict)

            # NOTE(review): these values are hard-coded and are assumed to
            # mirror the hyper-parameters used inside run_xgb - confirm.
            mlflow.log_param("n_estimators", 1200)
            mlflow.log_param("learning_rate", 0.5)
            mlflow.log_param("max_depth", 8)

            mlflow.log_metric('rmse', rmse)
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("r2", r2)

            tracking_url_type_store = urlparse(
                mlflow.get_artifact_uri()).scheme
            if tracking_url_type_store != "file":
                # Register the model as well as logging it.
                mlflow.sklearn.log_model(
                    model_xgb,
                    "model",
                    registered_model_name=mlflow_config[
                        "registered_model_name"])
            else:
                # File-based store: log the model without registering it.
                # (This branch used to call load_model with the fitted model
                # object, which is not a model URI.)
                mlflow.sklearn.log_model(model_xgb, "model")

        logger.log(file_object, "Model file created successfully")
class dataTransform:
    """
    Transforms the validated "good" raw training files in place before they
    are loaded into the database.
    """

    def __init__(self):
        self.goodDataPath = 'Training_Raw_files_validated/Good_Raw'
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: For every CSV file in ``self.goodDataPath``, replaces
                     missing values with the string "NULL", drops the first
                     six characters of each ``Wafer`` value and writes the
                     file back in place.
        Output: None
        On Failure: The error is written to
                    Training_Logs/dataTransformLog.txt and is NOT re-raised.
        """
        # The context manager closes the log even if logging itself fails.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + file
                    # Named ``frame`` so the ``csv`` module name is not
                    # shadowed inside this method.
                    frame = pd.read_csv(file_path)
                    frame.fillna('NULL', inplace=True)
                    frame['Wafer'] = frame['Wafer'].str[6:]
                    frame.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file, f"{file}: File Transformed successfully!!")
            except Exception as e:
                # Logged only: this variant reports failures without raising.
                self.logger.log(
                    log_file, f"Data Transformation Failed because:: {e}")
class dataTransform:
    """
    This class shall be used for transforming the Good Raw Training Data
    before loading it in Database!!.
    """

    def __init__(self):
        self.goodDataPath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def replaceMissingWithNull(self):
        """
        Method Name: replaceMissingWithNull
        Description: This method replaces the missing values in columns with
                     "NULL" to store in the table. The first six characters
                     of every ``Wafer`` value are dropped so that only the
                     trailing part is kept, to ease the loading. Each file in
                     ``self.goodDataPath`` is rewritten in place.
        Output: None
        On Failure: The error is written to
                    Training_Logs/dataTransformLog.txt and is NOT re-raised.
        """
        # The context manager closes the log even if logging itself fails.
        with open("Training_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for filename in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + "/" + filename
                    df = pandas.read_csv(file_path)
                    df.fillna('NULL', inplace=True)
                    df['Wafer'] = df['Wafer'].str[6:]
                    df.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % filename)
            except Exception as e:
                # Logged only: this variant reports failures without raising.
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
class dataTransformPredict:
    """
    Transforms the validated "good" raw prediction files in place before they
    are loaded into the database.
    """

    # Columns whose values are wrapped in single quotes.
    _QUOTED_COLUMNS = (
        "policy_bind_date", "policy_state", "policy_csl", "insured_sex",
        "insured_education_level", "insured_occupation", "insured_hobbies",
        "insured_relationship", "incident_state", "incident_date",
        "incident_type", "collision_type", "incident_severity",
        "authorities_contacted", "incident_city", "incident_location",
        "property_damage", "police_report_available", "auto_make",
        "auto_model",
    )

    def __init__(self):
        self.goodDataPath = "Prediction_Raw_Files_Validated/Good_Raw"
        self.logger = App_Logger()

    def replaceSingleQuotesToDouble(self):
        """
        Method Name: replaceSingleQuotesToDouble
        Description: For every CSV file in ``self.goodDataPath``, encloses
                     each value of the columns listed in ``_QUOTED_COLUMNS``
                     in single quotes and writes the file back in place.
        Output: None
        On Failure: The error is written to
                    Prediction_Logs/dataTransformLog.txt and re-raised.
        """
        # Opened outside the try block so the handler below can always rely
        # on ``log_file`` being bound; the context manager closes it.
        with open("Prediction_Logs/dataTransformLog.txt", 'a+') as log_file:
            try:
                for file in listdir(self.goodDataPath):
                    file_path = self.goodDataPath + '/' + file
                    data = pandas.read_csv(file_path)
                    for col in self._QUOTED_COLUMNS:
                        data[col] = data[col].apply(
                            lambda x: "'" + str(x) + "'")
                    data.to_csv(file_path, index=None, header=True)
                    self.logger.log(
                        log_file,
                        " %s: File Transformed successfully!!" % file)
            except Exception as e:
                self.logger.log(
                    log_file, "Data Transformation failed because:: %s" % e)
                raise
class Data_Getter:
    """
    This class shall be used for obtaining the data from the source for training.

    Written By: Piyush
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.training_file = '../training_file/Churn_Modelling.csv'
        # NOTE(review): this handle stays open for the lifetime of the
        # instance; nothing in this class closes it.
        self.file_object = open("../logs/filereadlogs/log.txt", 'a+')
        self.logger = App_Logger()

    def get_data(self):
        """
        Method Name: get_data
        Description: This method reads the data from source
                     (``self.training_file``) and keeps it on ``self.data``.
        Output: A pandas DataFrame.
        On Failure: Raise Exception (chained to the original error so the
                    cause is not lost).

        Written By: Piyush
        Version: 1.0
        Revisions: None
        """
        self.logger.log(
            self.file_object,
            'Entered the get_data method of the Data_Getter class')
        try:
            self.data = pd.read_csv(self.training_file)  # reading the data file
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_data method of the Data_Getter class. Exception message: ' + str(e))
            self.logger.log(
                self.file_object,
                'Data Load Unsuccessful.Exited the get_data method of the Data_Getter class')
            # Still a plain Exception for existing callers, but now carrying
            # a message and the underlying cause.
            raise Exception(
                'Data load failed for %s: %s' % (self.training_file, e)) from e
        self.logger.log(
            self.file_object,
            'Data Load Successful.Exited the get_data method of the Data_Getter class')
        return self.data
class dBOperation:
    """
    This class shall be used for handling all the SQL operations.

    Written By: Chethan D
    Version: 1.0
    Revisions: None
    """

    def __init__(self):
        self.path = 'Training_Database/'
        self.badFilePath = "Training_Raw_files_validated/Bad_Raw"
        self.goodFilePath = "Training_Raw_files_validated/Good_Raw"
        self.logger = App_Logger()

    def _log(self, log_path, message):
        """Append one message to the log file at ``log_path`` and close it."""
        with open(log_path, 'a+') as log_file:
            self.logger.log(log_file, message)

    def dataBaseConnection(self, DatabaseName):
        """
        Method Name: dataBaseConnection
        Description: This method creates the database with the given name and
                     if Database already exists then opens the connection to
                     the DB.
        Output: Connection to the DB
        On Failure: The error is logged to
                    Training_Logs/DataBaseConnectionLog.txt and re-raised
                    unchanged (sqlite3.Error or ConnectionError).

        Written By: Chethan D
        Version: 1.0
        Revisions: None
        """
        log_path = "Training_Logs/DataBaseConnectionLog.txt"
        try:
            conn = sqlite3.connect(self.path + DatabaseName + '.db')
        except (sqlite3.Error, ConnectionError) as e:
            # sqlite3 reports connection problems as sqlite3.Error; log the
            # actual error instance rather than the exception class.
            self._log(log_path, "Error while connecting to database: %s" % e)
            raise
        self._log(log_path, "Opened %s database successfully" % DatabaseName)
        return conn

    def createTableDb(self, DatabaseName, column_names):
        """
        Method Name: createTableDb
        Description: This method creates a table in the given database which
                     will be used to insert the Good data after raw data
                     validation. If ``Good_Raw_Data`` already exists nothing
                     is changed; otherwise it is created with one column per
                     entry of ``column_names`` (column name -> SQL type).
        Output: None
        On Failure: Raise Exception (after logging it to
                    Training_Logs/DbTableCreateLog.txt)

        Written By: Chethan D
        Version: 1.0
        Revisions: None
        """
        table_log = "Training_Logs/DbTableCreateLog.txt"
        conn = None
        try:
            conn = self.dataBaseConnection(DatabaseName)
            c = conn.cursor()
            c.execute("SELECT count(name) FROM sqlite_master "
                      "WHERE type = 'table' AND name = 'Good_Raw_Data'")
            if c.fetchone()[0] != 1:
                for column_name, data_type in column_names.items():
                    # Adding a column fails while the table does not exist
                    # yet; in that case the table is created with this
                    # column as its first one.
                    try:
                        conn.execute(
                            'ALTER TABLE Good_Raw_Data ADD COLUMN "{column_name}" {dataType}'.format(
                                column_name=column_name, dataType=data_type))
                    except sqlite3.OperationalError:
                        # The name is quoted here as well so the first column
                        # is handled like the ones added through ALTER TABLE.
                        conn.execute(
                            'CREATE TABLE Good_Raw_Data ("{column_name}" {dataType})'.format(
                                column_name=column_name, dataType=data_type))
            self._log(table_log, "Tables created successfully!!")
        except Exception as e:
            self._log(table_log, "Error while creating table: %s " % e)
            raise
        finally:
            # ``conn`` is still None when the connection itself failed.
            if conn is not None:
                conn.close()
                self._log("Training_Logs/DataBaseConnectionLog.txt",
                          "Closed %s database successfully" % DatabaseName)

    def insertIntoTableGoodData(self, Database):
        """
        Method Name: insertIntoTableGoodData
        Description: This method inserts the Good data files from the
                     Good_Raw folder into the above created table. Every data
                     line (the header line is skipped) is used verbatim as
                     the VALUES list of an INSERT. A file is committed as a
                     whole; when any of its rows fails, the file's rows are
                     rolled back, the file is moved to the Bad_Raw folder and
                     loading continues with the next file.
        Output: None
        On Failure: Raise Exception

        Written By: Chethan D
        Version: 1.0
        Revisions: None
        """
        conn = self.dataBaseConnection(Database)
        goodFilePath = self.goodFilePath
        badFilePath = self.badFilePath
        try:
            with open("Training_Logs/DbInsertLog.txt", 'a+') as log_file:
                for file in listdir(goodFilePath):
                    try:
                        with open(goodFilePath + '/' + file, "r") as f:
                            next(f)  # skip the header line
                            # Lines are read directly rather than through
                            # csv.reader(delimiter="\n"): a newline never
                            # acts as a field separator, and newer csv
                            # versions are understood to reject it.
                            for line in f:
                                values = line.rstrip("\r\n")
                                if not values:
                                    continue
                                # NOTE(review): the SQL is assembled from
                                # file content; this relies on the files
                                # having been validated and pre-quoted.
                                conn.execute(
                                    'INSERT INTO Good_Raw_Data values ({values})'.format(
                                        values=values))
                        conn.commit()
                        self.logger.log(
                            log_file,
                            " %s: File loaded successfully!!" % file)
                    except Exception as e:
                        conn.rollback()
                        self.logger.log(
                            log_file, "Error while creating table: %s " % e)
                        shutil.move(goodFilePath + '/' + file, badFilePath)
                        self.logger.log(
                            log_file, "File Moved Successfully %s" % file)
        finally:
            # Closed once, after all files, so that one bad file no longer
            # breaks the files that follow it.
            conn.close()

    def selectingDatafromtableintocsv(self, Database):
        """
        Method Name: selectingDatafromtableintocsv
        Description: This method exports the data in GoodData table as a CSV
                     file (Training_FileFromDB/InputFile.csv), creating the
                     output directory when needed.
        Output: None
        On Failure: The error is logged to Training_Logs/ExportToCsv.txt and
                    is NOT re-raised.

        Written By: Chethan D
        Version: 1.0
        Revisions: None
        """
        self.fileFromDb = 'Training_FileFromDB/'
        self.fileName = 'InputFile.csv'
        log_path = "Training_Logs/ExportToCsv.txt"
        conn = None
        try:
            conn = self.dataBaseConnection(Database)
            cursor = conn.cursor()
            cursor.execute("SELECT * FROM Good_Raw_Data")
            results = cursor.fetchall()

            # Get the headers of the csv file
            headers = [i[0] for i in cursor.description]

            # Make the CSV output directory
            if not os.path.isdir(self.fileFromDb):
                os.makedirs(self.fileFromDb)

            # The context manager flushes and closes the export file.
            with open(self.fileFromDb + self.fileName, 'w',
                      newline='') as out_file:
                csvFile = csv.writer(out_file, delimiter=',',
                                     lineterminator='\r\n',
                                     quoting=csv.QUOTE_ALL, escapechar='\\')
                # Add the headers and data to the CSV file.
                csvFile.writerow(headers)
                csvFile.writerows(results)

            self._log(log_path, "File exported successfully!!!")
        except Exception as e:
            self._log(log_path, "File exporting failed. Error : %s" % e)
        finally:
            if conn is not None:
                conn.close()