예제 #1
    def trainingModel(self):
        """Train one model on the whole dataset and save the best candidate.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, separate features and label,
        drop and encode columns with ``preprocess_cus.Preprocessor_cus``,
        impute missing values when ``is_null_present`` is truthy, rebalance
        the data with ``handleImbalanceDataset``, split it with
        ``test_size=1 / 5`` and ``random_state=42``, select a model with
        ``tuner.Model_Finder.get_best_model`` and save it through
        ``file_methods.File_Operation.save_model``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and the closing lines of several
        multi-line calls are absent, so the code does not parse as shown.
        Restore them from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
            #data.replace('?',np.NaN,inplace=True) # replacing '?' with NaN values for imputation

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data,

            # Dropping column after performing EDA
            preprocessor_cus = preprocess_cus.Preprocessor_cus(
                self.file_object, self.log_writer)
            X = preprocessor_cus.drop_column(X)
                "no": 0,
                "yes": 1
            }, inplace=True)  # Encoding Y (prediction label): "no" -> 0, "yes" -> 1

            # Response Encoding process
            # cat_cols2 = preprocessor_cus.categorical_column(X)
            # X = preprocessor_cus.ResponseEncoder(cat_cols2, X, Y)
            X = preprocessor_cus.test_data_encode(X)  # Using Predefined Values

            print("Shape of the dataset after encoding: ", X.shape)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                X = preprocessor.impute_missing_values(
                    X)  # missing value imputation

            features, label = preprocessor.handleImbalanceDataset(X, Y)
            # splitting the data into training and test set (no clustering in this variant)
            x_train, x_test, y_train, y_test = train_test_split(
                features, label, test_size=1 / 5, random_state=42)

            model_finder = tuner.Model_Finder(
                self.file_object, self.log_writer)  # object initialization

            # getting the best model for the training data
            best_model_name, best_model = model_finder.get_best_model(
                x_train, y_train, x_test, y_test)

            #saving the best model to the directory.
            file_op = file_methods.File_Operation(self.file_object,
            save_model = file_op.save_model(best_model, best_model_name)

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception as e:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here.
                                'Unsuccessful End of Training')
            raise Exception
예제 #2
    def trainingModel(self):
        """Train and save one best-performing model per KMeans cluster.

        Pipeline:
            1. Load the data through ``data_loader.Data_Getter``.
            2. Drop the ``'Wafer'`` column and split off the ``'Output'`` label.
            3. Impute missing values if any are reported.
            4. Drop the columns reported as having zero standard deviation.
            5. Cluster the features (cluster count chosen by ``elbow_plot``).
            6. For every cluster, split the rows into train/test sets, pick
               the best model with ``tuner.Model_Finder.get_best_model`` and
               save it as ``<best_model_name><cluster id>``.

        Start, success and failure are written to ``self.log_writer``.

        Raises:
            Exception: if any step fails. The triggering error is chained as
                ``__cause__`` and its text is included in the message, so the
                root cause is no longer lost.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        try:
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object, self.log_writer)
            data = data_getter.get_data()

            # --- data preprocessing ---
            preprocessor = preprocessing.Preprocessor(self.file_object, self.log_writer)
            # drop the 'Wafer' column; it is not used as a feature
            data = preprocessor.remove_columns(data, ['Wafer'])

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(data, label_column_name='Output')

            # if missing values are there, replace them appropriately
            if preprocessor.is_null_present(X):
                X = preprocessor.impute_missing_values(X)

            # a column with zero standard deviation is constant, so it cannot
            # help tell the classes apart; drop every such column
            cols_to_drop = preprocessor.get_columns_with_zero_std_deviation(X)
            X = preprocessor.remove_columns(X, cols_to_drop)

            # --- clustering ---
            kmeans = clustering.KMeansClustering(self.file_object, self.log_writer)
            # the elbow plot is used to find the optimum number of clusters
            number_of_clusters = kmeans.elbow_plot(X)

            # divide the data into clusters; the loop below relies on the
            # result carrying a 'Cluster' column
            X = kmeans.create_clusters(X, number_of_clusters)

            # attach the labels so they are filtered together with the features
            X['Labels'] = Y

            # --- one model per cluster ---
            for i in X['Cluster'].unique():
                cluster_data = X[X['Cluster'] == i]  # rows of this cluster only

                # Prepare the feature and label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'], axis=1)
                cluster_label = cluster_data['Labels']

                # NOTE(review): the original call was cut off after
                # `test_size=1 / 3,`. The fixed seed below is a reconstruction
                # that keeps the split reproducible -- confirm the value
                # against the upstream project.
                x_train, x_test, y_train, y_test = train_test_split(
                    cluster_features, cluster_label, test_size=1 / 3,
                    random_state=355)

                model_finder = tuner.Model_Finder(self.file_object, self.log_writer)

                # getting the best model for this cluster
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model; the cluster id keeps file names unique
                file_op = file_methods.File_Operation(self.file_object, self.log_writer)
                file_op.save_model(best_model, best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception as e:
            # logging the unsuccessful Training
            self.log_writer.log(self.file_object, 'Unsuccessful End of Training')
            # still a plain Exception for existing callers, but now carrying
            # the original error as message and cause
            raise Exception('Unsuccessful End of Training: ' + str(e)) from e
예제 #3
    def trainingModel(self):
        """Train and save one best model per cluster, label ``'fraud_reported'``.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, remove a fixed list of
        columns, replace ``'?'`` with ``np.NaN``, impute missing values when
        ``is_null_present`` is truthy, encode categorical columns, separate
        the ``'fraud_reported'`` label, cluster the features with
        ``clustering.KMeansClustering``, and for each value of the
        ``'Cluster'`` column split the rows, scale train and test sets with
        ``scale_numerical_columns``, select a model with
        ``tuner.Model_Finder.get_best_model`` and save it under
        ``best_model_name + str(i)``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and several call lines (for example the
        receiver of the ``replace`` call and the data arguments of
        ``train_test_split``) are absent, so the code does not parse as shown.
        Restore them from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
            data = preprocessor.remove_columns(data, [
                'policy_number', 'policy_bind_date', 'policy_state',
                'insured_zip', 'incident_location', 'incident_date',
                'incident_state', 'incident_city', 'insured_hobbies',
                'auto_make', 'auto_model', 'auto_year', 'age',
            ])  # remove the columns as they don't contribute to prediction.
                '?', np.NaN,
                inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation
            # encode categorical data
            data = preprocessor.encode_categorical_columns(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='fraud_reported')
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the labels as a new 'Labels' column so they are filtered together with the features
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,
                # Proceeding with more data pre-processing steps
                x_train = preprocessor.scale_numerical_columns(x_train)
                x_test = preprocessor.scale_numerical_columns(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory; the cluster id is appended to the name
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception as e:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here.
                                'Unsuccessful End of Training')
            raise Exception
예제 #4
    def trainingModel(self):
        """Train and save one best model per cluster, label ``'Result'``.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, call
        ``replaceInvalidValuesWithNull``, impute missing values when
        ``is_null_present`` is truthy, separate the ``'Result'`` label,
        cluster the features with ``clustering.KMeansClustering``, and for
        each value of the ``'Cluster'`` column split the rows, select a model
        with ``tuner.Model_Finder.get_best_model`` and save it under
        ``best_model_name + str(i)``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and the closing lines of several
        multi-line calls are absent, so the code does not parse as shown.
        Restore them from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            #removing unwanted columns as discussed in the EDA part in ipynb file
            #data = preprocessor.dropUnnecessaryColumns(data,['veiltype'])

            # replacing '?' values with np.nan as discussed in the EDA part

            data = preprocessor.replaceInvalidValuesWithNull(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation

            # get encoded values for categorical data (currently disabled)

            #data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='Result')

            # NOTE(review): no columns are dropped at this point in this variant
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the labels as a new 'Labels' column so they are filtered together with the features
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory; the cluster id is appended to the name
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here.
                                'Unsuccessful End of Training')
            raise Exception
    def trainingModel(self):
        """Train and save one best model per cluster for the cost label.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, drop the columns
        ``"serial"``, ``"rate"``, ``"listed_in(type)"`` and
        ``"listed_in(city)"``, remove duplicates, drop rows with missing
        values (``dropna(how='any')``) when ``is_null_present`` is truthy,
        convert the cost column with ``convertCostToNumber``, encode
        categorical values, separate the ``'approx_cost(for two people)'``
        label, cluster the features, and for each value of the ``'Cluster'``
        column split the rows, scale both sets with ``standardScalingData``,
        select a model with ``tuner.Model_Finder.get_best_model`` and save it
        under ``best_model_name + str(i)``.

        Raises:
            Exception: the ``except`` handler re-raises the caught error
                (``raise e``).

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and the closing lines of several
        multi-line calls are absent. In addition, the triple-quoted string
        opened on the "doing the data preprocessing." line is never closed,
        so every following line is swallowed into string literals. Restore
        the missing lines from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """ doing the data preprocessing. 
            All the pre processing steps are based on the EDA done previously
            1. Duplicate
            2. Remove columns: 	"serial","rate","listed_in(type)","listed_in(city)"
            3. Null removal
            4. Convert cost column to number
            5. Categorical to Numerical

            preprocessor = preprocessing.Preprocessor(self.file_object,

            #removing unwanted columns as discussed in the EDA part in ipynb file
            data = preprocessor.dropUnnecessaryColumns(
                data, ["serial", "rate", "listed_in(type)", "listed_in(city)"])

            # removing the duplicates
            data = preprocessor.removeDuplicates(data)

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                # here we won't do any imputation, just to show one more way, we'll drop the missing values
                data = data.dropna(how='any')

            # cost value to float
            data = preprocessor.convertCostToNumber(data)

            # get encoded values for categorical data

            data = preprocessor.encodeCategoricalValues(data)

            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='approx_cost(for two people)')
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            #create a new column in the dataset consisting of the corresponding cluster assignments.
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,

                x_train_scaled = preprocessor.standardScalingData(x_train)
                x_test_scaled = preprocessor.standardScalingData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                #getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train_scaled, y_train, x_test_scaled, y_test)

                #saving the best model to the directory.
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception as e:
            # logging the unsuccessful Training
                                'Unsuccessful End of Training')
            raise e
    def trainingModel(self):
        """Train and save one best model per cluster, then report the run.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, drop the ``'veil-type'``
        column, call ``replaceInvalidValuesWithNull``, impute missing values
        when ``is_null_present`` is truthy, encode categorical values, write
        the encoded frame to ``tmp.csv``, separate the ``'class'`` label,
        cluster the features, and for each value of the ``'Cluster'`` column
        split the rows, select a model with
        ``tuner.Model_Finder.get_best_model`` (the cluster id ``i`` is passed
        as a fifth argument) and save it under ``best_model_name + str(i)``.
        Afterwards it iterates over ``self.performance_list``, logs success
        and sends an HTML e-mail through ``self.emailObj.trigger_mail``.
        Progress is also echoed with ``print``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and several call lines (closing
        arguments, the call inside the ``performance_list`` loop, the end of
        the e-mail subject expression) are absent, so the code does not parse
        as shown. Restore them from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,

            #removing unwanted columns as discussed in the EDA part in ipynb file
            # NOTE(review): this message says "done" but is printed before the drop runs
            print('Dropping Unnecessary columns done')
            data = preprocessor.dropUnnecessaryColumns(data, ['veil-type'])
            print('Operation Done!!')

            # replacing '?' values with np.nan as discussed in the EDA part
            print('Replace Invalid Values with NULL')
            data = preprocessor.replaceInvalidValuesWithNull(data)
            print('Operation Done!!')

            # check if missing values are present in the dataset
            print('Getting columns for NULL values')
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(
            print('Operation Done!!')

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                print('Imputing Missing values!!')
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation
                print('Operation Done')

            # get encoded values for categorical data
            data = preprocessor.encodeCategoricalValues(data)

            # dump the encoded frame to tmp.csv in the current working directory
            data.to_csv('tmp.csv', index=False)
            # create separate features and labels
            X, Y = preprocessor.separate_label_feature(
                data, label_column_name='class')

            # NOTE(review): no columns are dropped at this point in this variant
            """ Applying the clustering approach"""
            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the labels as a new 'Labels' column so they are filtered together with the features
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # debug output: confirm both helper columns exist before dropping them
                if 'Labels' in cluster_data:
                    print('Labels Column Found')
                if 'Cluster' in cluster_data:
                    print('Cluster Column Found')
                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                cluster_label = cluster_data['Labels']
                print('Cluster Label and Features Created')
                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,
                print('Train Test Split Done!')
                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                print('Finding best model for cluster: ', i)
                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test, i)

                # saving the best model to the directory; the cluster id is appended to the name
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            print('Inserting Performance Metrics to MongoDB')
            for dict_l in self.performance_list:
                # NOTE(review): the first line of the insert call is missing here.
                                           'performance_metrics', dict_l)
            self.log_writer.log(self.file_object, 'Successful End of Training')
            print('Successfully end training')

            # Triggering Email
            msg = MIMEMultipart()
            msg['Subject'] = 'MushroomTypeClassifier - Model Train | ' + str(
            body = 'Model Training Done Successfully. Please find the models in models/ directory... <br><br> Thanks and Regards, <br> Rahul Garg'
            msg.attach(MIMEText(body, 'html'))
            to_addr = ['*****@*****.**']
            self.emailObj.trigger_mail(to_addr, [], msg)

        except Exception as e:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here. Also,
            # `'...' + e` adds a str to an exception object, which raises TypeError;
            # it presumably needs str(e) -- confirm against the original.
                                'Unsuccessful End of Training: ' + e)
            raise Exception
예제 #7
    def trainingModel(self):
        """Train and save one best model per cluster and return a summary.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, call ``remove_columns`` with
        an empty list, replace ``'?'`` with ``np.NaN``, impute missing values
        when ``is_null_present`` is truthy, group the target values with
        ``grouping_values_of_target``, split the frame with
        ``separate_data_frame(label_column_name=['Rings', 'Sex'])``, cluster
        ``X``, concatenate ``Y`` back column-wise, encode categorical
        columns, and for each value of the ``'Cluster'`` column split the
        rows (label ``'Rings'``), select a model with
        ``tuner.Model_Finder.get_best_model`` (which here returns three
        values) and save it under ``best_model_name + str(i)``.

        Returns:
            The ``df`` frame created with the columns ``'Cluster_No'``,
            ``'Best_Model_Name'`` and ``'Roc_Auc_score'``, extended once per
            cluster through ``df.append``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and several call lines (the receiver of
        the ``replace`` call, the data arguments of ``train_test_split``, the
        braces of the ``df.append`` argument) are absent, so the code does
        not parse as shown. Restore them from the original project before
        use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
            data = preprocessor.remove_columns(
                data, []
            )  # the column list is empty, so no column is named for removal here
                '?', np.NaN,
                inplace=True)  # replacing '?' with NaN values for imputation

            # check if missing values are present in the dataset
            is_null_present, cols_with_missing_values = preprocessor.is_null_present(

            # if missing values are there, replace them appropriately.
            if (is_null_present):
                data = preprocessor.impute_missing_values(
                    data, cols_with_missing_values)  # missing value imputation

            # the target column has many distinct labels and is highly imbalanced, so they are grouped with the preprocessor method below

            data = preprocessor.grouping_values_of_target(data)

            # create two separate data frames: one to run the clustering on, the other re-attached after clustering
            X, Y = preprocessor.separate_data_frame(
                data, label_column_name=['Rings', 'Sex'])
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # re-attach the separated columns (Y) to the clustered frame, column-wise
            X = pd.concat([X, Y], axis=1, sort=False)

            # encode categorical data
            X = preprocessor.encode_categorical_columns(X)

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""
            # summary frame returned to the caller
            df = pd.DataFrame(
                columns=['Cluster_No', 'Best_Model_Name', 'Roc_Auc_score'])

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Rings', 'Cluster'],
                cluster_label = cluster_data['Rings']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model, Roc_Auc_score = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory; the cluster id is appended to the name
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))
                # NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the
                # replacement. The opening/closing of the dict argument is missing below.
                df = df.append(
                        'Cluster_No': i,
                        'Best_Model_Name': best_model_name + str(i),
                        'Roc_Auc_score': Roc_Auc_score

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')
            return df

        except Exception as e:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here.
                                'Unsuccessful End of Training')
            raise Exception
예제 #8
    def trainingModel(self):
        """Train and save one best model per cluster, label ``'class'``.

        Steps visible in this listing: log the start of training, load the
        data through ``data_loader.Data_Getter``, encode categorical values,
        split off the ``'class'`` column as the label, rebalance with
        ``handleImbalanceDataset``, cluster the features with
        ``clustering.KMeansClustering``, and for each value of the
        ``'Cluster'`` column split the rows, scale train and test sets with
        ``scaleData``, select a model with
        ``tuner.Model_Finder.get_best_model`` and save it under
        ``best_model_name + str(i)``.

        Raises:
            Exception: the ``except`` handler raises a bare ``Exception``
                instead of the error that was caught.

        NOTE(review): this listing looks truncated -- the ``try:`` that
        matches the ``except`` below and the closing lines of several
        multi-line calls are absent, so the code does not parse as shown.
        Restore them from the original project before use.
        """
        # Logging the start of Training
        self.log_writer.log(self.file_object, 'Start of Training')
        # NOTE(review): a `try:` line appears to be missing at this point.
            # Getting the data from the source
            data_getter = data_loader.Data_Getter(self.file_object,
            data = data_getter.get_data()
            """doing the data preprocessing"""

            preprocessor = preprocessing.Preprocessor(self.file_object,
            #data=preprocessor.remove_columns(data,['Wafer']) # remove the unnamed column as it doesn't contribute to prediction.

            # encode categorical values (method name is spelled this way in the call)
            data = preprocessor.enocdeCategoricalvalues(data)

            # features are every column except 'class'; 'class' is the label
            X = data.drop(['class'], axis=1)
            Y = data['class']

            X, Y = preprocessor.handleImbalanceDataset(X, Y)
            """ Applying the clustering approach"""

            kmeans = clustering.KMeansClustering(
                self.file_object, self.log_writer)  # object initialization.
            number_of_clusters = kmeans.elbow_plot(
            )  #  using the elbow plot to find the number of optimum clusters

            # Divide the data into clusters
            X = kmeans.create_clusters(X, number_of_clusters)

            # add the labels as a new 'Labels' column so they are filtered together with the features
            X['Labels'] = Y

            # getting the unique clusters from our dataset
            list_of_clusters = X['Cluster'].unique()
            """parsing all the clusters and looking for the best ML algorithm to fit on individual cluster"""

            for i in list_of_clusters:
                cluster_data = X[X['Cluster'] ==
                                 i]  # filter the data for one cluster

                # Prepare the feature and Label columns
                cluster_features = cluster_data.drop(['Labels', 'Cluster'],
                cluster_label = cluster_data['Labels']

                # splitting the data into training and test set for each cluster one by one
                x_train, x_test, y_train, y_test = train_test_split(
                    test_size=1 / 3,
                # train and test sets are passed to scaleData in separate calls
                x_train = preprocessor.scaleData(x_train)
                x_test = preprocessor.scaleData(x_test)

                model_finder = tuner.Model_Finder(
                    self.file_object, self.log_writer)  # object initialization

                # getting the best model for each of the clusters
                best_model_name, best_model = model_finder.get_best_model(
                    x_train, y_train, x_test, y_test)

                # saving the best model to the directory; the cluster id is appended to the name
                file_op = file_methods.File_Operation(self.file_object,
                save_model = file_op.save_model(best_model,
                                                best_model_name + str(i))

            # logging the successful Training
            self.log_writer.log(self.file_object, 'Successful End of Training')

        except Exception:
            # logging the unsuccessful Training
            # NOTE(review): the first line of the logging call is missing here.
                                'Unsuccessful End of Training')
            raise Exception