Example #1
    def trainingModel(self):
        # Logging start of the training
        self.loggerObj.logger_log("Start of TrainingModel")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.loggerObj)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.loggerObj)

            # replacing '?' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # Drop columns whose missing values exceed 50% of total observations
            data = preprocessor.dropUnnecessaryColumns(data)

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(data)

            # if missing values are there, impute them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, y = preprocessor.separate_label_feature(
                data, label_column_name='classes')

            # Categorical encoding
            X, y = preprocessor.encodeCategoricalValues(X, y)

            # Handling the imbalanced dataset using SMOTE
            #X, y = preprocessor.handleImbalanceDataset(X, y)

            # splitting the data into training and test set
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.3, random_state=44)

            # Standardisation of X_train data
            X_train = preprocessor.data_standardisation(X_train)

            # applying same standardisation object on X_test data
            X_test = preprocessor.prediction_data_standardisation(X_test)

            model_finder = tuner.Model_Finder(
                self.loggerObj)  # object initialization

            # getting the best model for the dataset
            best_model_name, best_model = model_finder.get_best_model(
                X_train, y_train, X_test, y_test)

            # saving the best model to the directory.
            file_op = file_methods.File_Operation(self.loggerObj)
            save_model = file_op.save_model(best_model, best_model_name)

        except Exception:
            # Logging the unsuccessful training
            self.loggerObj.logger_log("Unsuccessful end of training")
            raise
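The preprocessor methods used above are defined elsewhere in the project. As a point of reference, here is a minimal sketch of what replaceInvalidValuesWithNull presumably does, given the '?' convention described in the comments (the body is an assumption, not the project's actual code):

import numpy as np

def replaceInvalidValuesWithNull(data):
    # the EDA identified '?' as the dataset's missing-value marker;
    # mapping it to np.nan lets the imputation step treat it uniformly
    return data.replace('?', np.nan)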
Example #2
    def trainingModel(self, csv_data="old_data", file_object=None):
        # Default arguments are evaluated once at definition time, so opening
        # the log file in the signature would share one handle across all
        # calls; open it lazily instead.
        if file_object is None:
            file_object = open("log_file/ModelTraining_log.txt", 'a+')
        # Logging the start of Training
        self.log_writer.log(file_object, '=========== Start of Training =============')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(file_object, self.log_writer)
            if (csv_data=="old_data"):
                data = data_getter.get_data()
            else:
                data = data_getter.get_data(csv_data)
            """ doing the data preprocessing . 
            All the pre processing steps are based on the EDA done previously
            """
            """
            1. Initializing columns
            2. Changing null values to Other category in Condtion column
            3. Null removal
            4. Removing stopwords 
            5. Adding important Features 
            6. Vectorization 
            """
            # initializing preprocessor class
            preprocessor = preprocessing.Preprocessor(file_object, self.log_writer)
            # initializing columns in data
            data = preprocessor.initialize_columns(data)
            # replacing missing values in condition feature in data
            data = preprocessor.replace_missing_in_condition(data)
            # removing rows containing null values
            data = preprocessor.remove_null_values(data)
            # separating important features
            new_data = preprocessor.separate_imp_feature(data)
            # removing stopwords from review column
            new_data = preprocessor.remove_stopwords(new_data)
            # adding new features and scores for better results
            new_data = preprocessor.adding_new_features(new_data)
            # vectorizing the dataset into features and labels
            features, labels, tfidf = preprocessor.vectorizor(new_data)

            model = LinearSVC(penalty='l2')
            x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=0)
            x_train = self.normalize.fit_transform(x_train)
            x_test = self.normalize.transform(x_test)
            model.fit(x_train, y_train)

            # saving the model and the tf-idf transformer as pickle files
            import pickle
            with open('pickle_files/drug_LinearSVC.pkl', 'wb') as file:
                pickle.dump(model, file)

            with open('pickle_files/d_transform.pkl', 'wb') as file:
                pickle.dump(tfidf, file)

            # reloading both pickles to verify they were written correctly
            with open('pickle_files/drug_LinearSVC.pkl', 'rb') as file:
                ml = pickle.load(file)
            with open('pickle_files/d_transform.pkl', 'rb') as file:
                t = pickle.load(file)
            self.log_writer.log(file_object, '=========== Training Successful =============')
        except Exception as e:
            self.log_writer.log(file_object,
                                'Exception occurred in trainingModel method of the trainModel class. Exception message:  ' + str(e))
            raise
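A minimal inference sketch that reuses the two pickles written above, assuming tfidf is the fitted vectorizer returned by vectorizor (note the fitted normalizer from training is not persisted by this method, so that step is omitted here):

import pickle

with open('pickle_files/drug_LinearSVC.pkl', 'rb') as f:
    model = pickle.load(f)
with open('pickle_files/d_transform.pkl', 'rb') as f:
    tfidf = pickle.load(f)

features = tfidf.transform(["sample review text"])
print(model.predict(features))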
Example #3
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile() #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object,'Start of Prediction')
            data_getter=data_loader_prediction.Data_Getter_Pred(self.file_object,self.log_writer)
            data=data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(data, [
                'education'])  # remove the column as it doesn't contribute to prediction.
            data = preprocessor.remove_unwanted_spaces(data)  # remove unwanted spaces from the dataframe
            data.replace('?', np.NaN, inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(data, cols_with_missing_values)  # missing value imputation

            # Proceeding with more data pre-processing steps
            scaled_num_df = preprocessor.scale_numerical_columns(data)
            cat_df = preprocessor.encode_categorical_columns(data)
            X = pd.concat([scaled_num_df, cat_df], axis=1)


            file_loader=file_methods.File_Operation(self.file_object,self.log_writer)
            kmeans=file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters=kmeans.predict(X) # assign each record to its cluster
            X['clusters']=clusters
            clusters=X['clusters'].unique()
            predictions=[]
            for i in clusters:
                cluster_data= X[X['clusters']==i]
                cluster_data = cluster_data.drop(['clusters'],axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result=(model.predict(cluster_data))
                for res in result:
                    if res==0:
                        predictions.append('<=50K')
                    else:
                        predictions.append('>50K')

            final= pd.DataFrame(list(zip(predictions)),columns=['Predictions'])
            path="Prediction_Output_File/Predictions.csv"
            final.to_csv("Prediction_Output_File/Predictions.csv",header=True,mode='a+') #appends result to prediction file
            self.log_writer.log(self.file_object,'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
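find_correct_model_file is not shown in this listing. Since the training examples below save each model as best_model_name + str(i), a plausible sketch is a directory scan for the entry whose name ends with the cluster number (the directory name is an assumption):

import os

def find_correct_model_file(cluster_number, model_directory='models/'):
    # training saved one model per cluster, suffixed with the cluster id
    for name in os.listdir(model_directory):
        if name.endswith(str(cluster_number)):
            return name
    raise FileNotFoundError('No model found for cluster %s' % cluster_number)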
Example #4
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            is_null_present = preprocessor.is_null_present(data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                data)
            data = preprocessor.remove_columns(data, cols_to_drop)

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters = kmeans.predict(data.drop(
                ['Wafer'],
                axis=1))  #drops the first column for cluster prediction
            data['clusters'] = clusters
            data.to_csv(
                'Prediction_Raw_Files_Validated/finalpredictiondata.csv',
                index=False)
            clusters = data['clusters'].unique()
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                wafer_names = list(cluster_data['Wafer'])
                cluster_data = cluster_data.drop(labels=['Wafer'], axis=1)  # drop from the cluster subset, not the full frame
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = list(model.predict(cluster_data))
                result = pandas.DataFrame(list(zip(wafer_names, result)),
                                          columns=['Wafer', 'Prediction'])
                path = "Prediction_Output_File/Predictions.csv"
                result.to_csv("Prediction_Output_File/Predictions.csv",
                              header=True,
                              mode='a+')  #appends result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
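Because several examples here append with mode='a+' and header=True, every run interleaves a fresh header row with the data. If append semantics are wanted, a small guard writes the header only once (a sketch, not the project's code):

import os

path = "Prediction_Output_File/Predictions.csv"
# emit the header only when the file does not exist yet
result.to_csv(path, mode='a', header=not os.path.exists(path), index=False)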
Example #5
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.scaleData(data)

            #data = preprocessor.enocdeCategoricalvalues(data)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters = kmeans.predict(
                data)  # assign each record to its cluster
            data['clusters'] = clusters
            clusters = data['clusters'].unique()
            result = []
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data)):
                    if val == 0:
                        result.append("Lodgepole_Pine")
                    elif val == 1:
                        result.append("Spruce_Fir")
                    elif val == 2:
                        result.append("Douglas_fir")
                    elif val == 3:
                        result.append("Krummholz")
                    elif val == 4:
                        result.append("Ponderosa_Pine")
                    elif val == 5:
                        result.append("Aspen")
                    elif val == 6:
                        result.append("Cottonwood_Willow")
            result = pandas.DataFrame(result, columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True,
                          mode='a+')  #appends result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')

        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
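The if/elif chain above can be expressed as a lookup table, which keeps the class-index mapping in one place; an equivalent sketch:

COVER_TYPES = {
    0: "Lodgepole_Pine", 1: "Spruce_Fir", 2: "Douglas_fir",
    3: "Krummholz", 4: "Ponderosa_Pine", 5: "Aspen",
    6: "Cottonwood_Willow",
}
result.extend(COVER_TYPES[val] for val in model.predict(cluster_data))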
Example #6
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.logTransformation(data)
            # print(data)

            # #scale the prediction data
            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data), columns=data.columns)

            # #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(
                data_scaled)  # assign each record to its cluster
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()
            result = []  # initialize blank list for storing predictions

            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data.values)):
                    result.append(val)
            result = pandas.DataFrame(result, columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True)  # writes the results to the prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')

        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex

        return path
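standardScalingData is applied directly to the prediction data here; if it refits a scaler, the prediction-time statistics will differ from those seen during training. The safer pattern, as Example #1 does with its separate prediction_data_standardisation step, is to fit once during training, persist the scaler, and only transform at prediction time (paths and names below are assumptions):

import pickle
from sklearn.preprocessing import StandardScaler

# training side: fit and persist
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
with open('models/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# prediction side: reload and transform only
with open('models/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)
data_scaled = scaler.transform(data)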
Example #7
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  # deletes the existing file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            #if missing values are there,  replace them appropriately.
            if is_null_present:
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  #missing value imputation

            # Proceeding with more data pre-processing steps
            X = preprocessor.scale_numerical_columns(data)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(
                X)  # assign each record to its cluster
            X['clusters'] = clusters
            clusters = X['clusters'].unique()

            predictions = []
            for i in clusters:
                cluster_data = X[X['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                # accumulate every cluster's predictions instead of keeping
                # only the last cluster's result
                predictions.extend(model.predict(cluster_data))

            final = pd.DataFrame(
                list(zip(range(len(predictions)), predictions)),
                columns=['Customer No.', 'Predictions'],
            )
            path = "Prediction_Output_File/Predictions.csv"
            final.to_csv("Prediction_Output_File/Predictions.csv",
                         header=True,
                         mode='a+')  #append result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, final.head().to_json(orient='records')
Example #8
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            # drop 'Unnamed: 0' if the csv carried over an index column
            if 'Unnamed: 0' in data.columns:
                data.drop('Unnamed: 0', axis=1, inplace=True)

            # Dropping column after performing EDA
            preprocessor_cus = preprocess_cus.Preprocessor_cus(
                self.file_object, self.log_writer)
            data = preprocessor_cus.drop_column(data)

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            # replacing '?' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            # get encoded values for categorical data
            data = preprocessor_cus.test_data_encode(data)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)

            result = []  # initialize blank list for storing predictions

            model = file_loader.load_model('CatBoost')
            for val in (model.predict(data)):
                result.append(val)

            result = pandas.DataFrame(result, columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            result['Predictions'].replace({0: "no", 1: "yes"}, inplace=True)
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True)  # writes the results to the prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
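File_Operation.load_model is not shown in this listing; a minimal pickle-based sketch consistent with how it is called above (the directory layout and file extension are assumptions):

import os
import pickle

def load_model(model_name, model_directory='models/'):
    # each model is assumed to live in its own folder under model_directory
    path = os.path.join(model_directory, model_name, model_name + '.sav')
    with open(path, 'rb') as f:
        return pickle.load(f)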
Example #9
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile() #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object,'Start of Prediction')
            data_getter=data_loader_prediction.Data_Getter_Pred(self.file_object,self.log_writer)
            data=data_getter.get_data()


            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
            data = preprocessor.remove_columns(data,[])  # no columns need to be removed for this dataset
            data.replace('?', np.nan, inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(data, cols_with_missing_values)  # missing value imputation
            # encode categorical data
            #data = preprocessor.encode_categorical_columns(data)
            df=data.copy()
            df.drop(labels=['Sex'],axis=1,inplace=True)

            file_loader = file_methods.File_Operation(self.file_object, self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed

            clusters=kmeans.predict(df)
            data['clusters']=clusters
            data = preprocessor.encode_categorical_columns(data)
            clusters=data['clusters'].unique()
            predictions=[]
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'],axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = (model.predict(np.array(cluster_data)))
                for res in result:
                    if res == 0:
                        predictions.append('1-8 Rings')
                    elif res == 1:
                        predictions.append('11+ Rings')
                    else:
                        predictions.append('9-10 Rings')

            final= pd.DataFrame(list(zip(predictions)),columns=['Predictions'])
            path="Prediction_Output_File/Predictions.csv"
            final.to_csv("Prediction_Output_File/Predictions.csv",header=True,mode='a+') #appends result to prediction file
            self.log_writer.log(self.file_object,'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, final
Example #10
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile() #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object,'Start of Prediction')
            data_getter=data_loader_prediction.Data_Getter_Pred(self.file_object,self.log_writer)
            data=data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)
            data = preprocessor.dropUnnecessaryColumns(data,
                                                       ['TSH_measured', 'T3_measured', 'TT4_measured', 'T4U_measured',
                                                        'FTI_measured', 'TBG_measured', 'TBG', 'TSH'])

            # replacing '?' values with np.nan as discussed in the EDA part

            data = preprocessor.replaceInvalidValuesWithNull(data)

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValuesPrediction(data)
            is_null_present=preprocessor.is_null_present(data)
            if(is_null_present):
                data=preprocessor.impute_missing_values(data)

            #data=data.to_numpy()
            file_loader=file_methods.File_Operation(self.file_object,self.log_writer)
            kmeans=file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters=kmeans.predict(data) # assign each record to its cluster
            data['clusters']=clusters
            clusters=data['clusters'].unique()
            result=[] # initialize blank list for storing predictions
            with open('EncoderPickle/enc.pickle', 'rb') as file: #let's load the encoder pickle file to decode the values
                encoder = pickle.load(file)

            for i in clusters:
                cluster_data= data[data['clusters']==i]
                cluster_data = cluster_data.drop(['clusters'],axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (encoder.inverse_transform(model.predict(cluster_data))):
                    result.append(val)
            result = pandas.DataFrame(result,columns=['Predictions'])
            path="Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",header=True) # writes the results to the prediction file
            self.log_writer.log(self.file_object,'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
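For EncoderPickle/enc.pickle to exist at prediction time, the training pipeline must have persisted a fitted label encoder; a minimal sketch of that training-side step (the variable y and the encoder type are assumptions taken from context):

import pickle
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)  # y: raw class labels used in training
with open('EncoderPickle/enc.pickle', 'wb') as f:
    pickle.dump(encoder, f)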
Example #11
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile() #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object,'Start of Prediction')
            data_getter=data_loader_prediction.Data_Getter_Pred(self.file_object,self.log_writer)
            data=data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)
            data = preprocessor.dropUnnecessaryColumns(data,["serial","rate","listed_in(type)","listed_in(city)"])


            is_null_present,cols_with_missing_values=preprocessor.is_null_present(data)
            if(is_null_present):
                data=data.dropna(how='any')


            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValues(data)
            #scale the prediction data
            data_scaled = pandas.DataFrame(preprocessor.standardScalingData(data),columns=data.columns)

            #data=data.to_numpy()
            file_loader=file_methods.File_Operation(self.file_object,self.log_writer)
            kmeans=file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters=kmeans.predict(data_scaled) # assign each record to its cluster
            data_scaled['clusters']=clusters
            clusters=data_scaled['clusters'].unique()
            result=[] # initialize blank list for storing predictions
            # with open('EncoderPickle/enc.pickle', 'rb') as file: #let's load the encoder pickle file to decode the values
            #     encoder = pickle.load(file)

            for i in clusters:
                cluster_data= data_scaled[data_scaled['clusters']==i]
                cluster_data = cluster_data.drop(['clusters'],axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data.values)):
                    result.append(val)
            result = pandas.DataFrame(result,columns=['Predictions'])
            path="Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",header=True) # writes the results to the prediction file
            self.log_writer.log(self.file_object,'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
Example #12
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            data = preprocessor.dropUnnecessaryColumns(data, [
                'id', 'region', 'url', 'region_url', 'image_url', 'state',
                'type', 'dogs_allowed'
            ])

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValuesPrediction(data)

            data_scaled = pandas.DataFrame(
                preprocessor.standardScalingData(data), columns=data.columns)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed
            clusters = kmeans.predict(
                data_scaled)  # assign each record to its cluster
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()
            result = []  # initialize blank list for storing predictions
            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data.values)):
                    result.append(val)
            result = pandas.DataFrame(result, columns=['Prediction'])
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True,
                          mode='a+')  #appends result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
Example #13
    def predictionFromModel(self, singlerecdata=None):

        try:
            self.loggerObj.logger_log('Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.loggerObj)
            if singlerecdata is None:
                data = data_getter.get_data()
            else:
                data = data_getter.get_data_for_rec(singlerecdata)

            preprocessor = preprocessing.Preprocessor(self.loggerObj)

            # replacing '?' values with np.nan as discussed in the EDA part
            data = preprocessor.replaceInvalidValuesWithNull(data)

            data = preprocessor.dropUnnecessaryColumnsForPrediction(data)

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            data = preprocessor.encodeCategoricalValuesPrediction(data)

            data = preprocessor.prediction_data_standardisation(data)

            file_loader = file_methods.File_Operation(self.loggerObj)
            model_name = file_loader.find_model_file()
            model = file_loader.load_model(model_name)

            result = []  # initialize blank list for storing predictions
            with open(
                    'EncoderPickle/enc.pickle', 'rb'
            ) as file:  # let's load the encoder pickle file to decode the values
                encoder = pickle.load(file)
            if singlerecdata is None:
                for val in (encoder.inverse_transform(model.predict(data))):
                    result.append(val)
                result = pandas.DataFrame(result, columns=['Predictions'])
                path = "Prediction_Output_File/Predictions.csv"
                result.to_csv("Prediction_Output_File/Predictions.csv",
                              header=True)  # writes the results to the prediction file
                self.loggerObj.logger_log('End of Prediction')
                return path
            else:
                val = encoder.inverse_transform(model.predict(data))
                self.loggerObj.logger_log('End of Prediction')
                return val

        except Exception as ex:
            self.loggerObj.logger_log(
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
Example #14
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile() #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object,'Start of Prediction')
            data_getter=data_loader_prediction.Data_Getter_Pred(self.file_object,self.log_writer)
            data=data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)
            is_null_present=preprocessor.is_null_present(data)
            if(is_null_present):
                data=preprocessor.impute_missing_values(data)

            cols_to_drop=preprocessor.get_columns_with_zero_std_deviation(data)
            data=preprocessor.remove_columns(data,cols_to_drop)
            #data=data.to_numpy()
            file_loader=file_methods.File_Operation(self.file_object,self.log_writer)
            kmeans=file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters=kmeans.predict(data.drop(['Wafer'],axis=1))#drops the first column for cluster prediction
            data['clusters']=clusters
            clusters=data['clusters'].unique()
            for i in clusters:
                #selecting all the records of a particular cluster type
                cluster_data= data[data['clusters']==i]
                #getting all the wafer names
                wafer_names = list(cluster_data['Wafer'])
                #dropping wafer and clusters columns from the cluster subset
                cluster_data = cluster_data.drop(['Wafer','clusters'],axis=1)
                #finding the model name for that cluster
                model_name = file_loader.find_correct_model_file(i)
                #loading the model using the model name
                model = file_loader.load_model(model_name)
                #these are the predicted values 
                pred_values = list(model.predict(cluster_data))
                #creating a dataframe with wafernames and predictions
                result = pandas.DataFrame(list(zip(wafer_names,pred_values)),columns=['Wafer','Prediction'])
                #path to save the dataframe as csv file
                path = "Prediction_Output_File/Predictions.csv"
                #writing to csv files
                result.to_csv(path,header=True,mode='a+') #appends result to prediction file
            self.log_writer.log(self.file_object,'End of Prediction')
        except Exception as ex:
            self.log_writer.log(self.file_object, 'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
Example #15
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            data.replace(-1, 1, inplace=True)
            new_data = data[[
                'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
                'A91'
            ]]

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(
                new_data)  # assign each record to its cluster
            new_data['clusters'] = clusters
            clusters = new_data['clusters'].unique()
            result = []  # initialize blank list for storing predictions

            for i in clusters:
                cluster_data = new_data[new_data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data)):
                    result.append(val)
            result = pandas.DataFrame(result, columns=['Prediction'])
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True,
                          mode='a+')  #appends result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path, result.head().to_json(orient="records")
Example #16
    def predictionFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, "Start of Prediction!!")
            data_getter = Data_Getter_Pred(self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            data = preprocessor.replaceInvalidValuesWithNull(data)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(data)
            data['clusters'] = clusters
            clusters = data['clusters'].unique()
            results = []

            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data)):
                    results.append(val)
            results = pd.DataFrame(results, columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            results.to_csv("Prediction_Output_File/Predictions.csv",
                           header=True)
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as e:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % e)
            raise e
        return path
Example #17
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            #data  = preprocessor.logTransformation(data)

            # encode the prediction data; assign the result back so the
            # encoded frame is actually used downstream
            data = preprocessor.encodeCategoricalValuesPrediction(data)
            ###Time features
            data = preprocessor.create_timefeatures(data)

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)

            model = file_loader.load_model('XGBOOST')
            result = model.predict(data)

            result = pandas.DataFrame(result, columns=['Predictions'])
            result['Item_Identifier'] = data["Item_Identifier"]
            result["Outlet_Identifier"] = data["Outlet_Identifier"]
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv("Prediction_Output_File/Predictions.csv",
                          header=True)  # writes the results to the prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
Example #18
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #data.replace('?',np.NaN,inplace=True) # replacing '?' with NaN values for imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='default payment next month')

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X, cols_with_missing_values)  # missing value imputation
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                # Proceeding with more data pre-processing steps
                train_x = preprocessor.scale_numerical_columns(x_train)
                test_x = preprocessor.scale_numerical_columns(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    train_x, y_train, test_x, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise e
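elbow_plot is referenced by both training pipelines but not shown; a plausible sketch using the common WCSS-plus-knee-detection recipe (the kneed dependency and the hyperparameters are assumptions):

from sklearn.cluster import KMeans
from kneed import KneeLocator

def elbow_plot(X, max_k=10):
    # within-cluster sum of squares for k = 1..max_k
    wcss = [KMeans(n_clusters=k, init='k-means++', random_state=42)
            .fit(X).inertia_ for k in range(1, max_k + 1)]
    # the knee of the WCSS curve is the optimum cluster count
    kn = KneeLocator(range(1, max_k + 1), wcss,
                     curve='convex', direction='decreasing')
    return kn.knee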
Example #19
    def predictionFromModel(self):
        try:
            self.pred_data_val.deletePredictionFile()
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(data, [
                'policy_number', 'policy_bind_date', 'policy_state',
                'insured_zip', 'incident_location', 'incident_date',
                'incident_state', 'incident_city', 'insured_hobbies',
                'auto_make', 'auto_model', 'auto_year', 'age',
                'total_claim_amount'
            ])
            data.replace('?', np.nan, inplace=True)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)

            data = preprocessor.encode_categorical_columns(data)
            data = preprocessor.scale_numerical_columns(data)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            clusters = kmeans.predict(data)
            data['clusters'] = clusters
            clusters = data['clusters'].unique()
            predictions = []
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = (model.predict(cluster_data))
                for res in result:
                    if res == 0:
                        predictions.append('N')
                    else:
                        predictions.append('Y')
            final = pd.DataFrame(list(zip(predictions)),
                                 columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            final.to_csv("Prediction_Output_File/Predictions.csv",
                         header=True,
                         mode='a+')
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
Example #20
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, [
                'DATE', 'Precip', 'WETBULBTEMPF', 'DewPointTempF',
                'StationPressure'
            ])

            #replacing '?' values with np.nan as discussed in the EDA part

            data = preprocessor.replaceInvalidValuesWithNull(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='VISIBILITY')

            # drop the columns obtained above
            #X=preprocessor.remove_columns(X,cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the label column back to the clustered dataset
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise
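create_clusters is also used by both training examples but not shown; given how X is consumed afterwards (a 'Cluster' column, and a saved 'KMeans' model that the prediction pipelines load), a plausible sketch is:

from sklearn.cluster import KMeans

def create_clusters(X, number_of_clusters):
    kmeans = KMeans(n_clusters=number_of_clusters,
                    init='k-means++', random_state=42)
    X = X.copy()
    # tag every row with its cluster id; the fitted model would also be
    # persisted under the name 'KMeans' for the prediction pipelines to load
    X['Cluster'] = kmeans.fit_predict(X)
    return X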
Example #21
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data=data_getter.get_data()


            """ doing the data preprocessing. 
            All the pre processing steps are based on the EDA done previously
            """
            """
            1. Duplicate
            2. Remove columns: 	"serial","rate","listed_in(type)","listed_in(city)"
            3. Null removal
            4. Convert cost column to number
            5. Categorical to Numerical
            """

            preprocessor=preprocessing.Preprocessor(self.file_object,self.log_writer)


            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data,["serial","rate","listed_in(type)","listed_in(city)"])

            # removing the duplicates
            data=preprocessor.removeDuplicates(data)


            # check if missing values are present in the dataset
            is_null_present,cols_with_missing_values=preprocessor.is_null_present(data)

            # if missing values are there, replace them appropriately.
            if(is_null_present):
                # instead of imputing, we drop the rows with missing values here to demonstrate another approach
                data=data.dropna(how='any')

            # cost value to float
            data=preprocessor.convertCostToNumber(data)

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data, label_column_name='approx_cost(for two people)')


            """ Applying the clustering approach"""

            kmeans=clustering.KMeansClustering(self.file_object,self.log_writer) # object initialization.
            number_of_clusters=kmeans.elbow_plot(X)  # using the elbow plot to find the optimum number of clusters
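            # the elbow plot typically runs KMeans for a range of k values, plots the
            # within-cluster sum of squares (inertia), and picks the "knee" of the curve
            # (often automated with kneed's KneeLocator) as the optimum cluster count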

            # Divide the data into clusters
            X=kmeans.create_clusters(X,number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments
            X['Labels']=Y

            # getting the unique clusters from our dataset
            list_of_clusters=X['Cluster'].unique()

            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data=X[X['Cluster']==i] # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features=cluster_data.drop(['Labels','Cluster'],axis=1)
                cluster_label= cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(cluster_features, cluster_label, test_size=1 / 3, random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder=tuner.Model_Finder(self.file_object,self.log_writer) # object initialization

                #getting the best model for each of the clusters
                best_model_name,best_model=model_finder.get_best_model(x_train_scaled,y_train,x_test_scaled,y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,self.log_writer)
                save_model=file_op.save_model(best_model,best_model_name+str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            self.file_object.close()
            raise e
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log('wafer_log', 'Start of Training')
        try:
            # Getting the data from the source
            # data_getter=data_loader.Data_Getter(self.file_object,self.log_writer)
            data = self.mongo.downlaod_all_from_mongo(  # (sic) the helper's method name is misspelled where it is defined
                'wafer_good_data', 'temp_db')
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor('wafer_log',
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the wafer column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Good/Bad')

            # check if missing values are present in the dataset

            # if missing values are there, replace them appropriately.
            X.replace(to_replace='NULL', value=np.nan,
                      inplace=True)  # consumes  4 sec to compute
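            # presumably the raw sensor exports encode missing readings as the literal
            # string 'NULL'; converting them to np.nan lets the imputer detect them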
            is_null_present = preprocessor.is_null_present(X)
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(
                X)  # consumes a lot of time
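            # a zero-variance check like this is typically just X.columns[X.std() == 0];
            # such columns carry no signal, so dropping them shrinks the feature space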
            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                'wafer_log', self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments

            # X=pd.DataFrame.join(X,Y)
            X['Labels'] = Y.values

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for index, i in enumerate(list_of_clusters):
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    'wafer_log', self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                # file_op = file_methods.File_Operation('wafer_log',self.log_writer)
                # save_model=file_op.save_model(best_model,best_model_name+str(i))
                print(best_model)
                best_model = pickle.dumps(best_model)
                self.aws.Upload_To_S3_obj(best_model,
                                          best_model_name + str(index) +
                                          '.sav',
                                          bucket_prefix='wafer-model')
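                # pickle.dumps serialises the model to bytes, so Upload_To_S3_obj can
                # presumably hand it straight to something like boto3's put_object
                # instead of writing a .sav file to local disk first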

            # logging the successful Training
            self.log_writer.log('wafer_log', 'Successful End of Training')
            # self.file_object.close()

        except Exception as err:
            # logging the unsuccessful Training
            self.log_writer.log('wafer_log', 'Unsuccessful End of Training')
            # self.file_object.close()
            print(str(err))
            raise err
Example #23
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            print(data.head())
            """doing the data preprocessing as dicussed in EDA"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.
            data = preprocessor.binning(data)
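            # binning presumably buckets the continuous Age column into ordinal bands
            # (hence the Ageband column dropped below), a common Titanic feature-engineering step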
            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, ['Ageband'])
            #print(data.isnull().sum())
            data = preprocessor.combiningfornewfeature(data)
            data = preprocessor.dropUnnecessaryColumns(
                data, ['Parch', 'Sibsp', 'FamilySize', 'Pid'])

            data = preprocessor.convertCategoricalfeatureIntonumeric(data)

            data = preprocessor.binningfare(data)
            data = preprocessor.dropUnnecessaryColumns(data, ['FareBand'])
            print(data.head())
            #print(data.isnull().sum())

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Survived')
            print(Y)
            # We do not need to encode any values, as we opted for binning in this case.
            # All data is now ready for scaling and modelling.

            # no clustering is used for this dataset, so there is no per-cluster loop:
            # for i in list_of_clusters:

            # splitting the data into training and test sets
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=0.33, random_state=36)
            x_train_scaled = preprocessor.standardScalingData(x_train)
            x_test_scaled = preprocessor.standardScalingData(x_test)

            model_finder = tuner.Model_Finder(
                self.file_object, self.log_writer)  # object initialization
            #getting the best model.
            best_model_name, best_model, prediction, acc = model_finder.get_best_model(
                x_train_scaled, y_train, x_test_scaled, y_test)
            #saving the best model to the directory.
            print("Predictions:")
            print(prediction)
            print("Accuracy:")
            print(acc)

            file_op = file_methods.File_Operation(self.file_object,
                                                  self.log_writer)
            self.log_writer.log(self.file_object, 'Going to create directory')

            #save_model=file_op.save_model(best_model,best_model_name+str(i))
            save_model = file_op.save_model(best_model, best_model_name)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception to preserve the traceback
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)
            if (is_null_present):
                data = preprocessor.impute_missing_values(data)

            #data  = preprocessor.logTransformation(data)
            cols_to_drop = ["Item_Identifier", "Outlet_Identifier"]
            data_useful = preprocessor.remove_columns(data, cols_to_drop)
            #scale the prediction data
            data_scaled = preprocessor.scale_numerical_columns(data_useful)
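            # for consistent results, scale_numerical_columns should apply the same
            # scaler parameters that were fitted during training, not refit on this data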

            #data=data.to_numpy()
            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed
            #pred_data = data.drop(['Wafer'],axis=1)
            clusters = kmeans.predict(
                data_scaled)  # assign each record to its trained cluster
            data_scaled['clusters'] = clusters
            clusters = data_scaled['clusters'].unique()
            result = []  # initialize blank list for storing predictions
            # with open('EncoderPickle/enc.pickle', 'rb') as file: #let's load the encoder pickle file to decode the values
            #     encoder = pickle.load(file)

            for i in clusters:
                cluster_data = data_scaled[data_scaled['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                for val in (model.predict(cluster_data.values)):
                    result.append(val)
            result = pandas.DataFrame(result, columns=['Predictions'])
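            # note: predictions were collected cluster by cluster above, so their order
            # may not line up row-for-row with identifiers taken from the original data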
            result['Item_Identifier'] = data["Item_Identifier"]
            result["Outlet_Identifier"] = data["Outlet_Identifier"]
            path = "Prediction_Output_File/Predictions.csv"
            result.to_csv(path,
                          header=True)  # writes the results, overwriting the prediction file from the last run
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
Example #25
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data,
                                                       label_column_name='A1')

            new_X = X[[
                'H18', 'F76', 'F46', 'G57', 'C13', 'A71', 'E115', 'F56', 'I59',
                'A91'
            ]]
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                new_X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            new_X = kmeans.create_clusters(new_X, number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments
            new_X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = new_X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data = new_X[new_X['Cluster'] ==
                                     i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception to preserve the traceback
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)

            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(data, ['id'])

            #removing outliers from columns like height, weight, ap_hi, ap_lo
            data = preprocessor.dropOutliers(data)

            #processing gender and age columns and add new column BMI as discussed in the EDA part
            data = preprocessor.dataProcessor(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='cardio')
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # To handle the imbalanced dataset.
                rdsmple = RandomOverSampler()
                x_sampled, y_sampled = rdsmple.fit_resample(
                    cluster_features, cluster_label)  # fit_sample was renamed to fit_resample in imbalanced-learn >= 0.4

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    x_sampled, y_sampled, test_size=1 / 3, random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception to preserve the traceback
Example #27
    def predictionFromModel(self):

        try:
            self.pred_data_val.deletePredictionFile(
            )  #deletes the existing prediction file from last run!
            self.log_writer.log(self.file_object, 'Start of Prediction')
            data_getter = data_loader_prediction.Data_Getter_Pred(
                self.file_object, self.log_writer)
            data = data_getter.get_data()

            #code change
            # wafer_names=data['Wafer']
            # data=data.drop(labels=['Wafer'],axis=1)

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(data, [
                'policy_number', 'policy_bind_date', 'policy_state',
                'insured_zip', 'incident_location', 'incident_date',
                'incident_state', 'incident_city', 'insured_hobbies',
                'auto_make', 'auto_model', 'auto_year', 'age',
                'total_claim_amount'
            ])  # remove the column as it doesn't contribute to prediction.
            data.replace(
                '?', np.nan,
                inplace=True)  # replacing '?' with NaN for imputation (np.nan, since the np.NaN alias was removed in NumPy 2.0)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation
            # encode categorical data
            data = preprocessor.encode_categorical_columns(data)
            data = preprocessor.scale_numerical_columns(data)

            file_loader = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
            kmeans = file_loader.load_model('KMeans')

            ##Code changed

            clusters = kmeans.predict(data)
            data['clusters'] = clusters
            clusters = data['clusters'].unique()
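            # the KMeans model saved during training assigns each incoming record to a
            # cluster, so the matching per-cluster model can be loaded and applied below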
            predictions = []
            for i in clusters:
                cluster_data = data[data['clusters'] == i]
                cluster_data = cluster_data.drop(['clusters'], axis=1)
                model_name = file_loader.find_correct_model_file(i)
                model = file_loader.load_model(model_name)
                result = (model.predict(cluster_data))
                for res in result:
                    if res == 0:
                        predictions.append('N')
                    else:
                        predictions.append('Y')

            final = pd.DataFrame(list(zip(predictions)),
                                 columns=['Predictions'])
            path = "Prediction_Output_File/Predictions.csv"
            final.to_csv("Prediction_Output_File/Predictions.csv",
                         header=True,
                         mode='a+')  #appends result to prediction file
            self.log_writer.log(self.file_object, 'End of Prediction')
        except Exception as ex:
            self.log_writer.log(
                self.file_object,
                'Error occurred while running the prediction!! Error:: %s' % ex)
            raise ex
        return path
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            data = preprocessor.enocdeCategoricalvalues(data)  # (sic) the preprocessor's method name is misspelled at its definition

            X = data.drop(['class'], axis=1)
            Y = data['class']

            X, Y = preprocessor.handleImbalanceDataset(X, Y)
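            # caveat: oversampling before the train/test split means duplicated (or
            # synthetic) samples can land on both sides of the split, which tends to
            # inflate test scores; resampling only the training fold avoids this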
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)
                x_train = preprocessor.scaleData(x_train)
                x_test = preprocessor.scaleData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception to preserve the traceback
Example #29
    def trainingModel(self):
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
                                                  self.log_writer)
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
                                                      self.log_writer)
            data = preprocessor.remove_columns(
                data, ['Wafer']
            )  # remove the Wafer id column as it doesn't contribute to prediction.

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Output')

            # check if missing values are present in the dataset
            is_null_present = preprocessor.is_null_present(X)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            # check further which columns do not contribute to predictions
            # if the standard deviation for a column is zero, it means that the column has constant values
            # and they are giving the same output both for good and bad sensors
            # prepare the list of such columns to drop
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)

            # drop the columns obtained above
            X = preprocessor.remove_columns(X, cols_to_drop)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the target back as a 'Labels' column alongside the cluster assignments
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                                                      self.log_writer)
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            self.file_object.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object,
                                'Unsuccessful End of Training')
            self.file_object.close()
            raise  # re-raise the original exception to preserve the traceback
    def trainingModel(self):
        # Logging the start of Training
        self.log_db_writer.log(self.log_database, self.log_collection,
                               "Start of Training")
        print("training started")
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.log_database,
                                                  self.log_collection,
                                                  self.execution_id)
            data = data_getter.get_data()

            if len(data) == 0:
                self.log_db_writer.log(self.log_database, self.log_collection,
                                       "No record found to train model")
                print("No previous file available")
                return 0
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
                data)

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data)  # missing value imputation

            # get encoded values for categorical data

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Concrete_compressive _strength')
            # (the template's column-dropping step is not needed for this dataset)
            #X=preprocessor.remove_columns(X,cols_to_drop)

            X = preprocessor.logTransformation(X)
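            # the log transformation (typically np.log1p on skewed numeric columns)
            # compresses long right tails so linear models and distance-based
            # clustering are less dominated by extreme values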
            print(X)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.execution_id)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
                X
            )  # using the elbow plot to find the optimum number of clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)
            print("cluster shape details")
            print(X)

            # add the target back as a 'Labels' column alongside the cluster assignments
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """iterating over all the clusters and finding the best ML algorithm for each individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                                                     axis=1)
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features,
                    cluster_label,
                    test_size=1 / 3,
                    random_state=36)

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.log_database, self.log_collection,
                    self.execution_id)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                model_metrics = model_finder.get_model_metrics(
                    best_model_name + str(i))

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.log_database,
                                                      self.log_collection,
                                                      self.execution_id)
                print(best_model_name + str(i))
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Successful End of Training')
            #self.log_database.close()

        except Exception:
            # logging the unsuccessful Training
            self.log_db_writer.log(self.log_database, self.log_collection,
                                   'Unsuccessful End of Training')
            #self.log_database.close()
            raise  # re-raise the original exception to preserve the traceback


#trainModelObj = trainModel(1111)  # object initialization
#trainModelObj.trainingModel()  # training the model for the files in the table