def _get_train_and_test(self, data):
     """
     Split data into train and test datasets
     :param data: Pandas Dataframe
     :return X_train: Pandas Dataframe
     :return X_test: Pandas Dataframe
     :return y_train: Pandas Series
     :return y_test: Pandas Series
     """
     X = FeatureProcessing.drop_columns(data, column_list=['avg_bce'])
     y = data['avg_bce']
     # random_state=42 pins the seed for a reproducible split (42 being Douglas Adams' "Answer to Everything")
     X_train, X_test, y_train, y_test = train_test_split(X,
                                                         y,
                                                         test_size=0.1,
                                                         random_state=42)
     logging.info("Test train split done")
     return X_train, X_test, y_train, y_test
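FeatureProcessing.drop_columns is used throughout this listing but never shown; a minimal sketch of the thin wrapper it presumably is (the errors='ignore' choice is an assumption):

import pandas as pd

def drop_columns(data, column_list):
    # Hypothetical helper: drop the given columns, ignoring any that are absent
    return data.drop(columns=column_list, errors='ignore')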
 def model_explain(self, X_train, X_test, clf, data):
     """
     Explain model predictions on the test set with LIME
     :param X_train: Pandas Dataframe of training features
     :param X_test: Pandas Dataframe of test features
     :param clf: fitted regressor exposing a predict method
     :param data: Pandas Dataframe used to identify categorical features
     """
     feature_names = X_train.columns
     categorical_features = DataDescribe._get_categorical_feature(data)
     X_train = X_train.to_numpy()
     X_test = X_test.to_numpy()
     logging.info('Explaining model')
     explainer = lime.lime_tabular.LimeTabularExplainer(
         X_train,
         feature_names=feature_names,
         class_names=['a'],  # placeholder output name (display only in regression mode)
         categorical_features=categorical_features,
         verbose=True,
         mode='regression')
     for i in range(len(X_test)):
         exp = explainer.explain_instance(X_test[i],
                                          clf.predict,
                                          num_features=10)
         # Build the output path with os.path.join instead of a backslash literal
         location = os.path.join(constant._EXPLAIN_MODELPATH,
                                 'lime' + str(i) + '.html')
         exp.save_to_file(location)
         logging.info(
             f'Model prediction explanation for test row {i} saved at {location}'
         )
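For reference, a minimal, self-contained sketch of the same LIME tabular-regression workflow without the project helpers (DataDescribe, constant); the synthetic data, feature names and output file name are illustrative only:

import lime.lime_tabular
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Illustrative synthetic data; in the project this comes from _get_train_and_test
rng = np.random.RandomState(42)
X_train = rng.rand(200, 4)
y_train = X_train @ np.array([0.5, -1.0, 2.0, 0.1])
X_test = rng.rand(5, 4)

model = RandomForestRegressor(n_estimators=50, random_state=42).fit(X_train, y_train)

explainer = lime.lime_tabular.LimeTabularExplainer(
    X_train,
    feature_names=['f0', 'f1', 'f2', 'f3'],
    mode='regression')

# Explain the first test row and write the HTML report
exp = explainer.explain_instance(X_test[0], model.predict, num_features=4)
exp.save_to_file('lime0.html')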
Example #3
 def load_dataframe(self, filename):
     """
     Find missing values of given data
     :param filename: accepts csv file
     :return: Pandas Dataframe
     """
     df = pd.DataFrame()
     try:
         # df = pd.read_csv('E:\\pa_work_sample_training_data.csv')
         df = pd.read_csv(filename)
         logging.info(f'Shape of loaded Dataframe: {df.shape}')
         # print(df.head())
         logging.info("Dataframe loaded successfully")
     except Exception:
         logging.error(f'Failed to load dataframe from {filename}', exc_info=True)
     return df
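None of the snippets shows the logging configuration they rely on; a minimal sketch of wiring it up and calling load_dataframe (the CSV path is illustrative):

import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

# Hypothetical usage; 'training_data.csv' stands in for constant._FILENAME
describer = DataDescribe()
df = describer.load_dataframe('training_data.csv')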
Example #4
 def __init__(self, *args, **kwargs):
     super(Model, self).__init__(*args, **kwargs)
     logging.info("Model object created")
Example #5
 def __init__(self, predictor='avg_bce', *args, **kwargs):
     self.predictorcol = predictor
     logging.info("Data Description object created")
    def benchmark_model(self,
                        data,
                        cross_validation=False,
                        feature_selection=False):
        """
        Function to benchmark and check scores of different models
        :param data: Pandas datframe
        :return Pandas Dataframe
        :return Pandas Dataframe
        :return Pandas Dataframe
        """
        regressor = RandomForestRegressor(n_estimators=10000,
                                          random_state=0,
                                          n_jobs=-1,
                                          max_depth=100)
        # regressor = LinearRegression()
        # regressor = KNeighborsRegressor(n_neighbors=2)

        list_drop_columns = ['te', 'massion', 'idsion', 'isession', 'jments']
        list_label_encoding = ['ticulty', 'success', 'Score']
        list_one_hot = ['type']
        logging.info(f'Shape before feature engineering: {data.shape}')
        df = FeatureProcessing.manual_feature_engineering(
            data, list_drop_columns, list_one_hot, list_label_encoding)
        # df = FeatureProcessing.auto_feature_engineering (data, list_drop_columns, list_one_hot, list_label_encoding)
        logging.info(f'Shape after feature engineering: {df.shape}')
        # then do train & test split
        X_train, X_test, y_train, y_test = FeatureProcessing._get_train_and_test(
            df)
        if feature_selection:
            list_columns = ['te_1.SKEW(_1.nl)']
            X_train, X_test = FeatureProcessing.feature_selection(
                X_train, X_test, list_columns)
            logging.info(
                f'Shape of X_train after feature selection: {X_train.shape}')
        logging.info(f'Shape of X_train going into training: {X_train.shape}')
        # Standard scaling of quantitative features
        logging.info('Applying standard scaling')
        Model.scale_quant_features(X_train, X_test)

        if not cross_validation:
            model = Model.fit(regressor, X_train, y_train)
            y_pred = Model.predict(regressor, X_test)
            logging.info(f'mae: {Model._mae(y_pred, y_test)}')
            logging.info(f'mse: {Model._mse(y_pred, y_test)}')
            logging.info(f'R square: {Model._R2(y_pred, y_test)}')
            logging.info(
                f'Adjusted R Square: {Model.R2_ADJ(y_pred, y_test, X_train)}')
        else:
            model = Model.cv_test_models(X_train, y_train)
        logging.info(f'Creating pickle of model')
        Model.pickle(model)
        # FeatureProcessing.save_csv(X_train,filename = X_train)
        # FeatureProcessing.save_csv(X_test, filename = X_test)
        X_train.to_csv('x_train.csv', sep=',', encoding='utf-8')
        X_test.to_csv('x_test.csv', sep=',', encoding='utf-8')
        logging.info('Feature importances of the fitted random forest:')
        # Note: feature_importances_ is only populated when the regressor was
        # fitted above (i.e. cross_validation=False)
        for feature in zip(X_train.columns, regressor.feature_importances_):
            print(feature)
        return model, X_train, X_test
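The metric helpers _mae, _mse, _R2 and R2_ADJ are not defined in this listing; a minimal sketch of equivalents built on sklearn.metrics, with the usual adjusted-R² correction:

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def mae(y_pred, y_true):
    return mean_absolute_error(y_true, y_pred)

def mse(y_pred, y_true):
    return mean_squared_error(y_true, y_pred)

def r2(y_pred, y_true):
    return r2_score(y_true, y_pred)

def r2_adj(y_pred, y_true, X):
    # Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
    n, p = X.shape
    return 1 - (1 - r2(y_pred, y_true)) * (n - 1) / (n - p - 1)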
 def __init__(self, *args, **kwargs):
     super(FeatureProcessing, self).__init__(*args, **kwargs)
     logging.info("FeatureProcessing object created")
    def manual_feature_engineering(self, data, list_drop_columns, list_one_hot,
                                   list_label_encoding):
        """
         Generate features
         :param data: Pandas datframe
         :param list_drop_columns: List to contain column names to be dropped
         :param list_one_hot: List to contain column names to do one-hot-encoding
         :param list_label_encoding: List to contain column names to do label encoding
         :return: Python List
         """
        # Label encoding
        data = FeatureProcessing._label_encoder(
            data, column_list=list_label_encoding)

        # Drop columns
        logging.info(f'Shape before drop: {data.shape}')
        data = FeatureProcessing.drop_columns(data,
                                              column_list=list_drop_columns)
        data = FeatureProcessing.drop_columns(data,
                                              column_list=['id_infra_instan'])
        logging.info(f'Shape after drop: {data.shape}')

        # One hot encoding
        data = FeatureProcessing._get_dummies(data, column_list=list_one_hot)

        logging.info(f'Shape after One hot encoding: {data.shape}')
        data.to_csv('data1.csv', sep=',', encoding='utf-8')
        # Create new feature columns
        data = FeatureProcessing._get_multiply(data, 'totatness', 'corrrm',
                                               'correpct')
        # Group-by aggregate features over the following columns
        groupbyfeature = ['Tae', 'culty', 'Temarks']
        col_list = ['null', 'nl', 'nall', 'n_all', 'x_all']

        for groupbyfeaturecol in groupbyfeature:
            data = FeatureProcessing._get_groupby_df(data, groupbyfeaturecol,
                                                     col_list, 'mean')
            data = FeatureProcessing._get_groupby_df(data, groupbyfeaturecol,
                                                     col_list, 'max')
            data = FeatureProcessing._get_groupby_df(data, groupbyfeaturecol,
                                                     col_list, 'median')
            data = FeatureProcessing._get_groupby_df(data, groupbyfeaturecol,
                                                     col_list, 'skew')

        data.fillna(0, inplace=True)
        logging.info(f'Shape after feature engineering: {data.shape}')

        col_to_drop = FeatureProcessing._get_correlated_columns(data)
        logging.info(f'Correlated columns to drop: {col_to_drop}')
        logging.info(f'Shape before correlated feature drop: {data.shape}')
        # Dropping correlated columns is currently disabled
        # data = FeatureProcessing.drop_columns(data, column_list=col_to_drop)

        logging.info(f'Shape after correlated feature drop: {data.shape}')
        data.to_csv('manual_feature_engg.csv', sep=',', encoding='utf-8')
        return data
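The _get_groupby_df helper used above is not shown; a plausible minimal sketch that appends group-level aggregates as new columns (the naming scheme is an assumption):

import pandas as pd

def get_groupby_df(data, group_col, value_cols, agg):
    # Hypothetical helper: append group-level aggregates of value_cols as new columns
    agg_df = data.groupby(group_col)[value_cols].transform(agg)
    agg_df.columns = [f'{group_col}_{agg}_{c}' for c in value_cols]
    return pd.concat([data, agg_df], axis=1)

# Illustrative usage on a toy frame
df = pd.DataFrame({'grp': ['a', 'a', 'b'], 'x': [1.0, 3.0, 2.0]})
df = get_groupby_df(df, 'grp', ['x'], 'mean')   # adds grp_mean_x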
    def auto_feature_engineering(self, data, list_drop_columns, list_one_hot,
                                 list_label_encoding):
        """
        Generate features automatically
        :param data: Pandas datframe
        :param list_drop_columns: List to contain column names to be dropped
        :param list_one_hot: List to contain column names to do one-hot-encoding
        :param list_label_encoding: List to contain column names to do label encoding
        :return: Python List
        """

        # Drop columns
        logging.info(f'Shape before drop: {data.shape}')
        data = FeatureProcessing.drop_columns(data,
                                              column_list=list_drop_columns)
        logging.info(f'Shape after drop: {data.shape}')

        # Get data in numerical format
        # One hot encoding
        data = FeatureProcessing._get_dummies(data, column_list=list_one_hot)
        # if 'tx_file_type_0'  in df.columns:
        #     df = FeatureProcessing.drop_columns ( data, column_list = [ 'tx_file_type_0'] )
        logging.info(f'Shape after One hot encoding: {data.shape}')

        # Label encoding
        data = FeatureProcessing._label_encoder(
            data, column_list=list_label_encoding)
        logging.info(f'Shape after label encoding: {data.shape}')

        # Define variable types explicitly; otherwise featuretools infers them automatically.
        pa_variable_types = {
            # 'tx_name': vtypes.Ordinal,
            # 'Task Name': vtypes.Ordinal,
            'tx_difficulty': vtypes.Ordinal
        }
        _inter = pd.cut(data['ggg'], bins=[0, 0.4, 0.7, 1], labels=[1, 2, 3])
        data.insert(5, '_intervals', _inter)
        score_percentage_inter = pd.cut(data['score_percentage'],
                                        bins=[40, 50, 60, 70, 80, 90],
                                        labels=[1, 2, 3, 4, 5])
        data.insert(5, 'score_percentage_intervals', score_percentage_inter)
        data['_intervals'] = pd.to_numeric(data['_intervals'], errors='coerce')
        data['score_percentage_intervals'] = pd.to_numeric(
            data['score_percentage_intervals'], errors='coerce')
        data['_intervals'] = data['_intervals'].astype(float).astype("Int64")
        data['score_percentage_intervals'] = data[
            'score_percentage_intervals'].astype(float).astype("Int64")
        # data.score_percentage_intervals = data.score_percentage_intervals.astype ( 'int64' )
        data = FeatureProcessing._get_multiply(data, 'hhhhhl', 'nuhhhh',
                                               'mul1')
        data.to_csv('data.csv', sep=',', encoding='utf-8')
        # data [ [ '_intervals' ] ] = data [ [ '_intervals' ] ].astype ( int)
        # data['_intervals'] = data[ 'Time spent' ].transform (lambda x: pd.qcut(x, 4, labels=range(1,5), duplicates='drop'))
        # data [ '_Bin_low' ] = pd.IntervalIndex ( _intervals ).left
        # data [ '_Bin_high' ] = pd.IntervalIndex ( _intervals ).right

        # score_percentage_intervals = data.groupby ( 'Task Name' ) [ 'score_percentage' ].transform ( pd.qcut(data.rank(method='first'),4))
        # data [ 'score_percentage_Bin_low' ] = pd.IntervalIndex ( _intervals ).left
        # data [ 'score_percentage_Bin_high' ] = pd.IntervalIndex ( _intervals ).right
        data.fillna(0, inplace=True)
        # data = FeatureProcessing.drop_columns ( data,
        #                                         column_list = ['Task Name'] )

        # Create an entity set
        logging.info(f'Creating entityset')
        es = ft.EntitySet("pa")
        # Create an entity. Here we are adding entity "pa_tx_info" to our entity "pa"
        logging.info(f'Adding entity to entityset')

        es.entity_from_dataframe(entity_id="pa_1",
                                 dataframe=data,
                                 index="id_infra_instan",
                                 variable_types=pa_variable_types)
        # Normalize entities: each index column below becomes a parent entity used for group-by aggregations

        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="Task_1",
                            index="Task")
        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="success_1",
                            index="success")

        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="_intervals_1",
                            index="_intervals")

        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="score_percentage_intervals_1",
                            index="score_percentage_intervals")

        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="Score_1",
                            index="Score")

        es.normalize_entity(base_entity_id="pa_1",
                            new_entity_id="difficulty_1",
                            index="difficulty")

        # Find interesting values for categorical variables, used to generate "where" clauses
        es.add_interesting_values()
        # Optionally visualize the entityset
        # es.plot()
        # Get feature_matrix & feature
        logging.info(f'Get feature matrix')
        feature_matrix, features = ft.dfs(entityset=es,
                                          target_entity="pa_1",
                                          verbose=True,
                                          approximate='36d')
        feature_matrix.fillna(0, inplace=True)

        feature_matrix.to_csv('auto_feature_engg.csv',
                              sep=',',
                              encoding='utf-8')
        logging.info(
            f'Automatic feature engineering done. Shape: {feature_matrix.shape}'
        )
        col_to_drop = FeatureProcessing._get_correlated_columns(feature_matrix)
        logging.info(f'Correlated columns to drop: {col_to_drop}')
        logging.info(
            f'Shape before correlated feature drop: {feature_matrix.shape}')
        # Dropping correlated columns is currently disabled
        # feature_matrix = FeatureProcessing.drop_columns(feature_matrix, column_list=col_to_drop)

        logging.info(
            f'Shape after correlated feature drop: {feature_matrix.shape}')
        # Get X_train, X_test, y_train, y_test
        # X_train, X_test, y_train, y_test = Model._get_train_and_test(features)
        return feature_matrix
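_get_correlated_columns (used here and in manual_feature_engineering) is not shown either; a minimal sketch of a typical correlation filter, assuming a 0.95 absolute-correlation threshold:

import numpy as np
import pandas as pd

def get_correlated_columns(df, threshold=0.95):
    # Hypothetical helper: columns whose absolute correlation with an
    # earlier column exceeds the threshold
    corr = df.select_dtypes('number').corr().abs()
    # Keep only the upper triangle so each pair is considered once
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    return [col for col in upper.columns if (upper[col] > threshold).any()]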
Example #10
def df_description():
    df = DataDescribe.load_dataframe(constant._FILENAME)
    logging.info(DataDescribe._get_missing_values(df))
    logging.info(DataDescribe._get_info(df))
    logging.info(DataDescribe._get_unique_values(df))
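_get_missing_values, _get_info and _get_unique_values are referenced but never defined in this listing; plausible equivalents (names and return shapes are assumptions):

import io
import pandas as pd

def get_missing_values(df):
    # Count of missing values per column
    return df.isnull().sum()

def get_unique_values(df):
    # Number of distinct values per column
    return df.nunique()

def get_info(df):
    # df.info() prints to a buffer; capture it so it can be logged
    buf = io.StringIO()
    df.info(buf=buf)
    return buf.getvalue()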