Example #1
 def __init__(self, output_path: str):
     """Initialise the objects and constants.
     :param output_path: the output path.
     """
     self.__logger = logging.getLogger(CONSTANTS.app_name)
     self.__logger.debug(__name__)
     self.__output_path = output_path
     self.__readers_writers = ReadersWriters()
Example #2
 def __init__(self, variables_settings: PandasDataFrame, output_path: str,
              output_table: str):
     """Initialise the objects and constants.
     :param variables_settings: the settings of the selected input variables.
     :param output_path: the output path.
     :param output_table: the output table name.
     """
     self.__logger = logging.getLogger(CONSTANTS.app_name)
     self.__logger.debug(__name__)
     self.__variables_settings = variables_settings
     self.__output_path = output_path
     self.__output_table = output_table
     self.__readers_writers = ReadersWriters()
     self.__FeatureParserThread = FeatureParserThread()
Example #3
 def __init__(self, model_features_table: str, input_path: str,
              output_path: str, input_features_configs: str,
              output_table: str):
     """Initialise the objects and constants.
     :param model_features_table: the feature table name.
     :param input_path: the input path.
     :param output_path: the output path.
     :param input_features_configs: the input features' configuration file.
     :param output_table: the output table name.
     """
     self.__logger = logging.getLogger(CONSTANTS.app_name)
     self.__logger.debug(__name__)
     self.__model_features_table = model_features_table
     self.__output_path = output_path
     self.__output_table = output_table
     self.__readers_writers = ReadersWriters()
     # initialise settings
     self.__variables_settings = self.__init_settings(
         input_path, input_features_configs)
     self.__features_dic_names = self.__init_features_names()
     self.__features_dic_dtypes = self.__init_features_dtypes()
     self.__init_output(output_path, output_table)
Example #4
    def __init__(self, method_name: str, path: str = None, title: str = None):
        """Initialise the objects and constants.
        :param method_name: the training method that will be used
        (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation,
        'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier,
        'dtc': Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes,
        'nn': Multi-Layer Perceptron (MLP) Neural Network}).
        :param path: the directory path of a trained model file previously saved by this application (if applicable).
        :param title: the file name of a trained model file previously saved by this application (if applicable).
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)

        self.__readers_writers = ReadersWriters()
        self.__method = None
        self.method_name = method_name
        self.model_labels = None
        self.model_train = None
        self.model_predict = dict()
        self.model_cross_validate = None
        if method_name is not None:
            self.__init__method(method_name)
        else:
            self.load(path, title)
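The method_name codes listed in the docstring correspond to standard scikit-learn estimators. The sketch below is a hypothetical illustration of such a mapping; the example does not show __init__method, so the class choices and the make_classifier helper are assumptions, and 'mlm' (mixed linear model) is omitted because it has no scikit-learn classifier equivalent.

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

# Hypothetical mapping of the short method names to scikit-learn estimator classes.
METHODS = {
    "lr": LogisticRegression,
    "lr_cv": LogisticRegressionCV,
    "rfc": RandomForestClassifier,
    "gbc": GradientBoostingClassifier,
    "dtc": DecisionTreeClassifier,
    "knc": KNeighborsClassifier,
    "nb": MultinomialNB,
    "nn": MLPClassifier,
}

def make_classifier(method_name: str, **kwargs):
    """Instantiate a classifier from its short name; raise on unknown names."""
    if method_name not in METHODS:
        raise ValueError("Unknown method name: " + method_name)
    return METHODS[method_name](**kwargs)

print(make_classifier("rfc", n_estimators=100))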
Example #5
class PreProcess:
    def __init__(self, output_path: str):
        """Initialise the objects and constants.
        :param output_path: the output path.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__output_path = output_path
        self.__readers_writers = ReadersWriters()

    def stats_discrete_df(self, df: PandasDataFrame, includes: List,
                          file_name: str) -> PandasDataFrame:
        """Calculate the odds ratio for all the features that are included and all the categorical states.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for discrete features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path,
                                        title=file_name,
                                        data=[],
                                        append=False)

        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=["Feature Name", f_name],
                                                append=True)
                summaries = stats.itemfreq(df[f_name])
                summaries = pd.DataFrame({
                    "value": summaries[:, 0],
                    "freq": summaries[:, 1]
                })
                summaries = summaries.sort_values("freq", ascending=False)
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=summaries,
                                                append=True,
                                                header=True)
        return summaries

    def stats_continuous_df(self, df: PandasDataFrame, includes: List,
                            file_name: str) -> PandasDataFrame:
        """Calculate the descriptive statistics for all the included continuous features.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for continuous features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path,
                                        title=file_name,
                                        data=[],
                                        append=False)

        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=["Feature Name", f_name],
                                                append=True)
                summaries = df[f_name].apply(pd.to_numeric).describe(
                    percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]).transpose()
                summaries = pd.Series.to_frame(summaries).transpose()
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=summaries,
                                                append=True,
                                                header=True)
        return summaries

    def factoring_group_wise(self,
                             df: PandasDataFrame,
                             categories_dic: Dict,
                             labels_dic: Dict,
                             dtypes_dic: Dict,
                             threaded: bool = False) -> PandasDataFrame:
        """Categorise groups of features that are selected.
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :param dtypes_dic: the dictionary of the dtypes of the categorised features.
        :param threaded: indicates if it is multi-threaded.
        :return: the inputted dataframe with categorised features (if applicable).
        """
        self.__logger.debug("Categorise groups of features.")
        categories_dic = OrderedDict(categories_dic)

        if threaded is not True:
            pool_df_encoded = self.__factoring_group_wise_series(
                df, categories_dic, labels_dic)
        else:
            pool_df_encoded = self.__factoring_group_wise_threaded(
                df, categories_dic, labels_dic)

        # encoded labels
        labels_encoded = []
        for label_group in categories_dic.keys():
            labels_encoded += list(categories_dic[label_group].keys())

        # preserve types
        dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic}
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        for label in labels_encoded:
            del dtype_orig[label]

        # combine
        df = df.drop(labels_encoded, axis=1)
        df = pd.concat([df] + pool_df_encoded, axis=1)
        df = df.astype(dtype_orig)
        return df

    def __factoring_group_wise_series(self, df: PandasDataFrame,
                                      categories_dic: Dict,
                                      labels_dic: Dict) -> List:
        """Categorise a group of features that are selected (single-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise groups of features (single-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        pool_df_encoded = []

        try:
            for label_group in categories_dic.keys():
                pool_df_encoded.append(
                    factoring_thread.factor_arr_multiple(label_group))
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def __factoring_group_wise_threaded(self, df: PandasDataFrame,
                                        categories_dic: Dict,
                                        labels_dic: Dict) -> List:
        """Categorise a group of features that are selected (multi-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise groups of features (multi-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        try:
            with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
                pool_df_encoded = pool.map(
                    partial(factoring_thread.factor_arr_multiple),
                    categories_dic.keys())
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def factoring_feature_wise(self,
                               df: PandasDataFrame,
                               categories_dic: Dict,
                               labels_dic: Dict,
                               dtypes_dic: Dict,
                               threaded: bool = False) -> PandasDataFrame:
        """Categorise features that are selected.
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :param dtypes_dic: the dictionary of the dtypes of the categorised features.
        :param threaded: indicates if it is multi-threaded.
        :return: the inputted dataframe with categorised features (if applicable).
        """
        self.__logger.debug("Categorise.")
        categories_dic = OrderedDict(categories_dic)

        if threaded is not True:
            pool_df_encoded = self.__factoring_feature_wise_series(
                df, categories_dic, labels_dic)
        else:
            pool_df_encoded = self.__factoring_feature_wise_threaded(
                df, categories_dic, labels_dic)

        # encoded labels
        labels_encoded = list(categories_dic.keys())

        # preserve types
        dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic}
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        for label in labels_encoded:
            del dtype_orig[label]

        # combine
        df = df.drop(labels_encoded, axis=1)
        df = pd.concat([df] + pool_df_encoded, axis=1)
        df = df.astype(dtype_orig)
        return df

    def __factoring_feature_wise_series(self, df: PandasDataFrame,
                                        categories_dic: Dict,
                                        labels_dic: Dict) -> List:
        """Categorise features that are selected (single-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise (single-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        pool_df_encoded = []

        try:
            for label_group in categories_dic.keys():
                pool_df_encoded.append(
                    factoring_thread.factor_arr(label_group))
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def __factoring_feature_wise_threaded(self, df: PandasDataFrame,
                                          categories_dic: Dict,
                                          labels_dic: Dict) -> List:
        """Categorise features that are selected (multi-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise (multi-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        try:
            with mp.Pool() as pool:
                pool_df_encoded = pool.map(
                    partial(factoring_thread.factor_arr),
                    categories_dic.keys())
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def transform_df(self,
                     df: PandasDataFrame,
                     excludes: List,
                     transform_type: str,
                     threaded: bool = False,
                     method_args: Dict = None,
                     **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param threaded: indicates if it is multi-threaded.
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the inputted dataframe with transformed features (if applicable).
        """
        self.__logger.info("Transform Features.")
        excludes = set(excludes)
        includes = [
            label for label in df.columns.values if label not in excludes
        ]
        method_args = dict() if method_args is None else method_args

        # preserve types
        dtype_orig = df.dtypes.to_dict()
        for label in includes:
            dtype_orig[label] = 'f8'
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        df = df.astype(dtype_orig)

        # transform
        if threaded is False:
            df, method_args = self.__transform_df_series(
                df, includes, transform_type, **kwargs)
        else:
            df, method_args = self.__transform_df_threaded(
                df, includes, transform_type, method_args, **kwargs)
        return df, method_args

    def __transform_df_series(self,
                              df: PandasDataFrame,
                              includes: List,
                              transform_type: str,
                              method_args: Dict = None,
                              **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method (single-threaded).
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the transformed feature.
        """
        self.__logger.debug("Transform features (single-threaded).")
        transform_thread = TransformThread(**kwargs)
        method_args = dict() if method_args is None else method_args

        try:
            if transform_type == "scale":
                for name in includes:
                    transform_thread.transform_scale_arr(df, method_args, name)
            elif transform_type == "robust_scale":
                for name in includes:
                    transform_thread.transform_robust_scale_arr(
                        df, method_args, name)
            elif transform_type == "max_abs_scalar":
                for name in includes:
                    transform_thread.transform_max_abs_scalar_arr(
                        df, method_args, name)
            elif transform_type == "normalizer":
                for name in includes:
                    transform_thread.transform_normalizer_arr(
                        df, method_args, name)
            elif transform_type == "kernel_centerer":
                for name in includes:
                    transform_thread.transform_kernel_centerer_arr(
                        df, method_args, name)
            elif transform_type == "yeo_johnson":
                for name in includes:
                    transform_thread.transform_yeo_johnson_arr(
                        df, method_args, name)
            elif transform_type == "box_cox":
                for name in includes:
                    transform_thread.transform_box_cox_arr(
                        df, method_args, name)
            else:
                raise Exception(transform_type)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        return df, method_args

    def __transform_df_threaded(self,
                                df: PandasDataFrame,
                                includes: List,
                                transform_type: str,
                                method_args: Dict = None,
                                **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method (multi-threaded).
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the transformed feature.
        """
        self.__logger.debug("Transform features (multi-threaded).")
        manager = mp.Manager()
        dt = manager.dict(
            list(zip(df[includes].columns, df[includes].T.values.tolist())))
        transform_thread = TransformThread(**kwargs)
        method_args = dict() if method_args is None else method_args

        # run
        try:
            with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
                if transform_type == "scale":
                    pool.map(
                        partial(transform_thread.transform_scale_arr, dt,
                                method_args), includes)
                elif transform_type == "robust_scale":
                    pool.map(
                        partial(transform_thread.transform_robust_scale_arr,
                                dt, method_args), includes)
                elif transform_type == "max_abs_scalar":
                    pool.map(
                        partial(transform_thread.transform_max_abs_scalar_arr,
                                dt, method_args), includes)
                elif transform_type == "normalizer":
                    pool.map(
                        partial(transform_thread.transform_normalizer_arr, dt,
                                method_args), includes)
                elif transform_type == "kernel_centerer":
                    pool.map(
                        partial(transform_thread.transform_kernel_centerer_arr,
                                dt, method_args), includes)
                elif transform_type == "yeo_johnson":
                    pool.map(
                        partial(transform_thread.transform_yeo_johnson_arr, dt,
                                method_args), includes)
                elif transform_type == "box_cox":
                    pool.map(
                        partial(transform_thread.transform_box_cox_arr, dt,
                                method_args), includes)
                else:
                    raise Exception(transform_type)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        # set
        for k, v in dt.items():
            df[k] = v

        return df, method_args

    def high_linear_correlation_df(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_corr_cut: float = 0.95,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected highly linearly correlated features.
        The Pearson correlation coefficient is calculated for every pair of variables to measure the linear
        dependence between them.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_corr_cut: the numeric value for the pair-wise absolute correlation cutoff. e.g. 0.95.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with high linear correlation (if applicable).")
        corr = None
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        summaries = OrderedDict()

        # search
        if to_search is True:
            corr = df[[col for col in df.columns
                       if col not in excludes]].corr(method='pearson')
            for label in corr.columns.values:
                matches_temp = list(
                    corr[abs(corr[label]) >= thresh_corr_cut].index)
                if len(matches_temp) > 1:
                    # set matches
                    try:
                        matches_temp.remove(label)
                    except (ValueError, AttributeError):
                        pass  # not in some-list! OR not behaving like a list!
                    matches = np.union1d(matches, matches_temp)

                    # summaries
                    for match in matches_temp:
                        if match in summaries.keys():
                            matches_temp.remove(match)
                    if len(matches_temp) > 0:
                        summaries[label] = matches_temp
                        self.__logger.info("High Linear Correlation: " +
                                           label + " ~ " + str(matches_temp))

        # delete
        df = self.__remove(
            df, summaries, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
            summaries["Correlation Matrix"] = corr
        return df, summaries

    def near_zero_var_df_sklearn(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_variance: float = 0.05,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (Scikit algorithm).
        Feature selector that removes all low-variance features.
        This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be
        used for unsupervised learning.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_variance: Features with a training-set variance lower than this threshold will be removed.
        The default is to keep all features with non-zero variance, i.e. remove the features that have the same
        value in all samples.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with near-zero-variance (if applicable), using Scikit algorithm."
        )
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        indices = OrderedDict()
        summaries = OrderedDict()

        # find indices
        for label in df.columns.values:
            indices[df.columns.get_loc(label)] = label

        # search
        if to_search is True:
            variances_ = feature_selection.VarianceThreshold(thresh_variance)
            variances_.fit(df)  # fit is required before querying the support mask
            matches_indices = variances_.get_support(indices=True)
            matches_labels = [indices[index] for index in matches_indices]
            for match in matches_labels:
                if match not in excludes:
                    matches += [match]

        # delete
        df = self.__remove(
            df, {'NZV': list(matches)}, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries

    def near_zero_var_df(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_unique_cut: float = 100,
            thresh_freq_cut: float = 1000,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (custom algorithm).
        The features that had constant counts less than or equal a threshold may be filtered out,
        to exclude highly constants and near-zero variances.
        Rules are as the following:
         - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be
           greater than a threshold;
         - Percent of unique values: The number of unique values divided by the total number of samples to be greater
           than the threshold.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples
        (upper limit). e.g. 10 * 100 / 100.
        :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value
        (lower limit). e.g. 95/5.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with near-zero-variance (if applicable), using custom algorithm."
        )
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        summaries = OrderedDict()

        # search
        if to_search is True:
            for label in df.columns.values:
                # set match and summaries
                # check of NaN
                if not isinstance(df[label].iloc[0], (int, np.integer, float, np.floating)) \
                        or np.isnan(np.sum(df[label])):
                    matches += [label]
                    continue
                # check of near zero variance
                match, summaries[label] = self.__near_zero_var(
                    df[label], label, excludes, thresh_unique_cut,
                    thresh_freq_cut)
                if match is True:
                    matches += [label]
                    self.__logger.info("Near Zero Variance: " + label)

        # to_remove
        df = self.__remove(
            df, {'NZV': list(matches)}, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries

    def __near_zero_var(self, arr: List, label: str, excludes: set,
                        thresh_unique_cut: float,
                        thresh_freq_cut: float) -> [bool, Dict]:
        """Assess a single feature for near-zero-variance (custom algorithm).
        The features that had constant counts less than or equal a threshold may be filtered out,
        to exclude highly constants and near-zero variances.
        Rules are as the following:
         - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be
           greater than a threshold;
         - Percent of unique values: The number of unique values divided by the total number of samples to be greater
           than the threshold.

        :param arr: the feature value.
        :param label: the feature name.
        :param excludes: the name of excluded features.
        :param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples
        (upper limit). e.g. 10 * 100 / 100.
        :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value
        (lower limit). e.g. 95/5.
        :return: indicates if the feature has near-zero-variance, and the unique values with their counts.
        """
        self.__logger.debug(
            "Find near-zero-variance (if applicable), using custom algorithm.")
        unique, counts = np.unique(arr, return_counts=True)
        if len(counts) == 1:
            return True, {'unique': list(unique), 'counts': list(counts)}
        else:
            counts = sorted(counts, reverse=True)
            if label not in excludes and (len(unique) * 100) / float(
                    len(arr)) > thresh_unique_cut:
                return True, {'unique': list(unique), 'counts': list(counts)}
            if label not in excludes and counts[0] / float(
                    counts[1]) > thresh_freq_cut:
                return True, {'unique': list(unique), 'counts': list(counts)}
            else:
                return False, {'unique': list(unique), 'counts': list(counts)}

    def __remove(self,
                 df: PandasDataFrame,
                 dict_matches: Dict,
                 to_search: bool,
                 path: str,
                 section: str = "features") -> PandasDataFrame:
        """Confirm removals and if confirmed, then re-read the selected features, then remove
        :param df: the features dataframe.
        :param dict_matches: the matched features.
        :param to_search: to search or use the saved configuration.
        :param path: the file path to the configuration file.
        :param section: the section name in the configuration file.
        :return: the updated features.
        """
        self.__logger.debug("Confirm removals and implement removal process.")
        config = PyConfigParser(path, CONSTANTS.app_name)

        if to_search is True:
            # write to config
            config.reset()
            config.write_dict(dict_matches, section)
            # confirm
            response = self.__readers_writers.question_overwrite(
                "the features defined in the following file to be removed: " +
                path)
            if response is False:
                config.reset()
                return df

        # if to_search is False or response was yes then read from config
        config.refresh()
        dict_matches = config.read_dict(section)

        # remove
        self.__logger.debug("The feature removal list: " +
                            ",".join(dict_matches))
        labels = [
            label for label_group in dict_matches.values()
            for label in label_group if label in df
        ]
        if len(labels) > 0:
            df = df.drop(labels, axis=1)
        return df
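The near_zero_var_df and __near_zero_var pair above applies two rules: the percentage of unique values and the ratio of the most frequent value to the second most frequent one. Below is a minimal, self-contained sketch of those two rules using only numpy; the thresholds passed in the usage lines are illustrative, not the project's defaults.

import numpy as np

def is_near_zero_variance(arr, thresh_unique_cut=100.0, thresh_freq_cut=1000.0):
    """Return True if the values look constant or near-constant."""
    unique, counts = np.unique(arr, return_counts=True)
    if len(counts) == 1:
        return True  # a single constant value
    counts = sorted(counts, reverse=True)
    unique_pct = len(unique) * 100.0 / len(arr)  # percent of unique values
    freq_ratio = counts[0] / float(counts[1])    # most common / second most common
    return unique_pct > thresh_unique_cut or freq_ratio > thresh_freq_cut

print(is_near_zero_variance([1] * 999 + [2], thresh_freq_cut=95 / 5))    # True (ratio 999)
print(is_near_zero_variance([1, 2, 3, 4] * 250, thresh_freq_cut=95 / 5))  # False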
Example #6
class FeatureParser:
    def __init__(self, variables_settings: PandasDataFrame, output_path: str,
                 output_table: str):
        """Initialise the objects and constants.
        :param variables_settings: the settings of the selected input variables.
        :param output_path: the output path.
        :param output_table: the output table name.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__variables_settings = variables_settings
        self.__output_path = output_path
        self.__output_table = output_table
        self.__readers_writers = ReadersWriters()
        self.__FeatureParserThread = FeatureParserThread()

    def generate(self, history_table: str, features: PandasDataFrame,
                 variables: PandasDataFrame,
                 prevalence: Dict) -> PandasDataFrame:
        """

        :param history_table: the source table alias name (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param features: the output features.
        :param variables: the input variables.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the output features.
        """
        variables_settings = self.__variables_settings[
            self.__variables_settings["Table_History_Name"] == history_table]

        for _, row in variables_settings.iterrows():
            self.__logger.info("variable: " + row["Variable_Name"] + " ...")

            if not pd.isnull(row["Variable_Aggregation"]):
                postfixes = row["Variable_Aggregation"].replace(' ',
                                                                '').split(',')
                # aggregate stats
                features_temp = self.__aggregate(
                    variables[row["Variable_Name"]],
                    row["Variable_Type_Original"], postfixes,
                    prevalence[row["Variable_Name"]])
                for p in range(len(postfixes)):
                    # feature name
                    feature_name = row["Variable_Name"] + "_" + postfixes[p]
                    # set
                    features[feature_name] = features_temp[:, p]
            else:
                # init and replace none by zero
                features_temp = np.nan_to_num(variables[row["Variable_Name"]])
                features_temp = np.where(features_temp == np.array(None), 0,
                                         features_temp)
                # set
                features[row["Variable_Name"]] = features_temp
        return features

    def __aggregate(self, variable: PandasDataFrame, variable_type: str,
                    postfixes: str, prevalence: Dict) -> NumpyNdarray:
        """

        :param variable: the input variable.
        :param variable_type: the type of input variable.
        :param postfixes: name of the aggregation functions.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the aggregated variable.
        """
        try:
            with mp.Pool() as pool:
                features_temp = pool.map(
                    partial(self.__FeatureParserThread.aggregate_cell,
                            postfixes, variable_type, prevalence), variable)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        features_temp = np.asarray(features_temp)
        return features_temp

    def prevalence(self, variable: PandasDataFrame,
                   variable_name: str) -> List:
        """
        :param variable: the input variable.
        :param variable_name: the name of the input variable.
        :return: the prevalence of values for the variable.
        """
        try:
            with mp.Pool() as pool:
                prevalence_temp = pool.map(
                    partial(self.__FeatureParserThread.prevalence_cell),
                    variable)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        prevalence_temp = [sub2 for sub1 in prevalence_temp for sub2 in sub1]
        prevalence = Counter(prevalence_temp).most_common()
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table, [
                variable_name, '; '.join(
                    [str(p[0]) + ":" + str(p[1]) for p in prevalence])
            ],
            append=True,
            ext="txt")
        prevalence = [p[0] for p in prevalence]
        return prevalence
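FeatureParser.prevalence above fans the per-cell parsing out over a multiprocessing pool and then ranks the values with collections.Counter. The single-process sketch below shows only the counting step; prevalence_cell here is a hypothetical stand-in for FeatureParserThread.prevalence_cell, which is not shown in the example.

from collections import Counter

def prevalence_cell(cell):
    """Hypothetical per-cell parser: split an 'a,b,b'-style cell into its values."""
    return str(cell).split(",")

def prevalence(column):
    """Return the distinct values of a column, ordered by decreasing frequency."""
    values = [value for cell in column for value in prevalence_cell(cell)]
    return [value for value, _ in Counter(values).most_common()]

print(prevalence(["a,b", "b", "b,c"]))  # ['b', 'a', 'c']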
Example #7
class Variables:
    def __init__(self, model_features_table: str, input_path: str,
                 output_path: str, input_features_configs: str,
                 output_table: str):
        """Initialise the objects and constants.
        :param model_features_table: the feature table name.
        :param input_path: the input path.
        :param output_path: the output path.
        :param input_features_configs: the input features' configuration file.
        :param output_table: the output table name.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__model_features_table = model_features_table
        self.__output_path = output_path
        self.__output_table = output_table
        self.__readers_writers = ReadersWriters()
        # initialise settings
        self.__variables_settings = self.__init_settings(
            input_path, input_features_configs)
        self.__features_dic_names = self.__init_features_names()
        self.__features_dic_dtypes = self.__init_features_dtypes()
        self.__init_output(output_path, output_table)

    def set(self, input_schemas: List, input_tables: List,
            history_tables: List, column_index: str, query_batch_size: int):
        """Set the variables by reading the selected features from MySQL database.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_size: the number of rows to be read in each batch.
        :return:
        """
        self.__logger.debug(__name__)
        query_batch_start, query_batch_max = self.__init_batch(
            input_schemas[0], input_tables[0])
        features_names, features_dtypes = self.__set_features_names_types()
        self.__validate_mysql_names(input_schemas, input_tables)
        prevalence = self.__init_prevalence(input_schemas, input_tables,
                                            history_tables)
        self.__set_batch(features_names, features_dtypes, input_schemas,
                         input_tables, history_tables, column_index,
                         prevalence, query_batch_start, query_batch_max,
                         query_batch_size)

    def __init_settings(self, input_path: str,
                        input_features_configs: str) -> PandasDataFrame:
        """Read and set the settings of input variables that are selected.
        :param input_path: the path of the input file.
        :param input_features_configs: the input features' configuration file.
        :return: the input variables settings.
        """
        self.__logger.debug(__name__)
        variables_settings = self.__readers_writers.load_csv(
            input_path, input_features_configs, 0, True)
        variables_settings = variables_settings.loc[
            (variables_settings["Selected"] == 1)
            & (variables_settings["Table_Reference_Name"] ==
               self.__model_features_table)]
        variables_settings = variables_settings.reset_index()
        return variables_settings

    def __init_features_names(self) -> Dict:
        """Generate the features names, based on variable name, source table alias name (a.k.a. history table
            name), and the aggregation function name.
        :return: the name of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_names = dict(
            zip(table_history_names,
                [[] for _ in range(len(table_history_names))]))
        for _, row in self.__variables_settings.iterrows():
            if not pd.isnull(row["Variable_Aggregation"]):
                postfixes = row["Variable_Aggregation"].replace(' ',
                                                                '').split(',')
                for postfix in postfixes:
                    features_names[row["Table_History_Name"]].append(
                        row["Variable_Name"] + "_" + postfix)
            else:
                features_names[row["Table_History_Name"]].append(
                    row["Variable_Name"])
        return features_names

    def __init_features_dtypes(self) -> Dict:
        """Generate the features types, based on the input configuration file.
        :return: the dtypes of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_dtypes = dict(
            zip(table_history_names,
                [[] for _ in range(len(table_history_names))]))
        for _, row in self.__variables_settings.iterrows():
            feature_types = row["Variable_dType"].replace(' ', '').split(',')
            for feature_type in feature_types:
                features_dtypes[row["Table_History_Name"]].append(feature_type)
        return features_dtypes

    def __init_output(self, output_path: str, output_table: str):
        """Initialise the output file by writing the header row.
        :param output_path: the output path.
        :param output_table: the output table name.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        self.__readers_writers.reset_csv(output_path, output_table)
        self.__readers_writers.save_csv(output_path,
                                        output_table,
                                        features_names,
                                        append=False)

    def __init_prevalence(self, input_schemas: List, input_tables: List,
                          history_tables: List) -> Dict:
        """Generate the prevalence dictionary of values for all the variables.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :return: the prevalence dictionary of values for all the variables.
        """
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table,
            ["Feature Name", "Top Prevalence Feature Name"],
            append=False,
            ext="ini")
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table, ["Feature Name", "Prevalence & Freq."],
            append=False,
            ext="txt")
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        prevalence = dict()

        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                history_tables[table_i]]
            prevalence[input_tables[table_i]] = dict()

            # for features
            for _, row in variables_settings.iterrows():
                self.__logger.info("Prevalence: " + row["Variable_Name"] +
                                   " ...")
                if not pd.isnull(row["Variable_Aggregation"]):
                    # read features
                    variables = self.__init_prevalence_read(
                        input_schemas[table_i], input_tables[table_i],
                        row["Variable_Name"])

                    # validate
                    if variables is None or len(variables) == 0:
                        continue

                    # prevalence
                    prevalence[input_tables[table_i]][row["Variable_Name"]] = \
                        feature_parser.prevalence(variables[row["Variable_Name"]], row["Variable_Name"])

                    # for sub features
                    postfixes = row["Variable_Aggregation"].replace(
                        ' ', '').split(',')
                    for p in range(len(postfixes)):
                        feature_name = row["Variable_Name"] + "_" + postfixes[p]
                        if len(postfixes[p]
                               ) > 11 and postfixes[p][0:11] == "prevalence_":
                            index = int(postfixes[p].split('_')[1]) - 1
                            feature_name_prevalence = "None"
                            if index < len(prevalence[input_tables[table_i]][
                                    row["Variable_Name"]]):
                                feature_name_prevalence = \
                                    feature_name + "_" + \
                                    str(prevalence[input_tables[table_i]][row["Variable_Name"]][index])
                            # save prevalence
                            self.__readers_writers.save_text(
                                self.__output_path,
                                self.__output_table,
                                [feature_name, feature_name_prevalence],
                                append=True,
                                ext="ini")
        return prevalence

    def __init_prevalence_read(self, input_schema: str, input_table: str,
                               variable_name: str) -> PandasDataFrame:
        """Read a variable from database, to calculate the prevalence of the values.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param variable_name: the variable name.
        :return: the selected variable.
        """
        query = "SELECT `" + variable_name + "` FROM `" + input_table + "`;"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __init_batch(self, input_schema: str, input_table: str) -> [int, int]:
        """Find the minimum and maximum value of the index column, to use when reading mysql tables in
            batches.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :return: the minimum and maximum of the index column.
        """
        self.__logger.debug(__name__)
        query = "select min(localID), max(localID) from `" + input_table + "`;"
        output = list(
            self.__readers_writers.load_mysql_query(query,
                                                    input_schema,
                                                    dataframing=False))
        if [r[0] for r in output][0] is None:
            self.__logger.error(__name__ + " No data is found: " + query)
            sys.exit()

        query_batch_start = int([r[0] for r in output][0])
        query_batch_max = int([r[1] for r in output][0])
        return query_batch_start, query_batch_max

    def __set_features_names_types(self):
        """Produce the sorted lists of features names and features dtypes.
        :return: the sorted lists of features names and features dtypes.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        features_dtypes = [
            pd.Series(dtype=f) for k in keys
            for f in self.__features_dic_dtypes[k]
        ]
        features_dtypes = pd.DataFrame(
            dict(zip(features_names, features_dtypes))).dtypes
        return features_names, features_dtypes

    def __set_batch(self, features_names: list, features_dtypes: Dict,
                    input_schemas: List, input_tables: List,
                    history_tables: List, column_index: str, prevalence: Dict,
                    query_batch_start: int, query_batch_max: int,
                    query_batch_size: int):
        """Using batch processing first read variables, then generate features and write them into output.
        :param features_names: the name of features that are selected.
        :param features_dtypes: the dtypes of features that are selected.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        """
        self.__logger.debug(__name__)
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        step = -1
        batch_break = False

        while not batch_break:
            step += 1
            features = None
            for table_i in range(len(input_schemas)):
                self.__logger.info("Batch: " + str(step) + "; Table: " +
                                   input_tables[table_i])

                # read job
                variables = self.__set_batch_read(input_schemas[table_i],
                                                  input_tables[table_i], step,
                                                  column_index,
                                                  query_batch_start,
                                                  query_batch_max,
                                                  query_batch_size)

                # validate
                if variables is None:
                    batch_break = True
                    break
                elif len(variables) == 0:
                    continue

                # process job
                if features is None:
                    features = pd.DataFrame(0,
                                            index=range(len(variables)),
                                            columns=features_names)
                    features = features.astype(dtype=features_dtypes)
                features = self.__set_batch_process(
                    feature_parser, history_tables[table_i], features,
                    variables, prevalence[input_tables[table_i]])

            # write job
            if features is not None:
                features = features.astype(dtype=features_dtypes)
                self.__set_batch_write(features)

    def __set_batch_read(
            self, input_schema: str, input_table: str, step: int,
            column_index: str, query_batch_start: int, query_batch_max: int,
            query_batch_size: int) -> PandasDataFrame:
        """Read the queried variables.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param step: the batch id.
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        :return: the queried variables.
        """
        step_start = query_batch_start + step * query_batch_size
        step_end = step_start + query_batch_size
        if step_start >= query_batch_max:
            return None
        # read
        query = "SELECT * FROM `" + input_table + \
                "` WHERE `" + str(column_index) + "` >= " + str(step_start) + \
                " AND `" + str(column_index) + "` < " + str(step_end) + ";"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __set_batch_process(self, feature_parser: FeaturesFeatureParser,
                            history_table: str, features: PandasDataFrame,
                            variables: PandasDataFrame,
                            prevalence: List) -> PandasDataFrame:
        """Process variables and generate features.
        :param feature_parser: the feature parser object.
        :param history_table: the source table alias name (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param features: the output features.
        :param variables: the input variables.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the generated features.
        """
        return feature_parser.generate(history_table, features, variables,
                                       prevalence)

    def __set_batch_write(self, features: PandasDataFrame):
        """Write the features into an output file.
        :param features: the output features.
        """
        self.__readers_writers.save_csv(self.__output_path,
                                        self.__output_table,
                                        features,
                                        append=True)

    def __validate_mysql_names(self, input_schemas: List,
                               history_tables: List):
        """Validate mysql tables and their columns, and generate exception if table/column name is invalid.
        :param input_schemas: the mysql database schemas.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        """
        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                history_tables[table_i]]
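            # variables_settings is filtered down to the feature rows that belong
            # to this history table before each configured column is checked.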
            # validate table name
            if not self.__readers_writers.exists_mysql(
                    input_schemas[table_i], history_tables[table_i]):
                self.__logger.error(__name__ + " - Table does not exist: " +
                                    history_tables[table_i])
                sys.exit()

            # for features
            for _, row in variables_settings.iterrows():
                # validate column name
                if not self.__readers_writers.exists_mysql_column(
                        input_schemas[table_i], history_tables[table_i],
                        row["Variable_Name"]):
                    self.__logger.error(__name__ +
                                        " - Column does not exist: " +
                                        row["Variable_Name"])
                    sys.exit()
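
# A minimal, illustrative sketch (not part of the original code) of the
# windowed-SELECT batching pattern that __set_batch_read implements. The
# run_query callable (executes a SQL string and returns a DataFrame) and the
# table/column names are assumptions made for the sketch.
def iter_index_batches(run_query, table, index_col, start, stop, size):
    """Yield one DataFrame per window of `size` rows over the index range [start, stop)."""
    step = 0
    while start + step * size < stop:
        lo = start + step * size
        hi = lo + size
        yield run_query("SELECT * FROM `" + table + "` WHERE `" + index_col +
                        "` >= " + str(lo) + " AND `" + index_col + "` < " +
                        str(hi) + ";")
        step += 1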
Example #8
0
class TrainingMethod:
    def __init__(self, method_name: str, path: str = None, title: str = None):
        """Initialise the objects and constants.
        :param method_name: the training method that will be used
        (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation,
        'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier,
        'dtc': Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes,
        'nn': Multi-Layer Perceptron (MLP) Neural Network}).
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)

        self.__readers_writers = ReadersWriters()
        self.__method = None
        self.method_name = method_name
        self.model_labels = None
        self.model_train = None
        self.model_predict = dict()
        self.model_cross_validate = None
        if method_name is not None:
            self.__init__method(method_name)
        else:
            self.load(path, title)

    def __init__method(self,
                       method_name: str,
                       model_labels: List = None,
                       model_train: Any = None,
                       model_predict: Dict = None,
                       model_cross_validate: NumpyNDArray = None):
        """Initialise the selected training method.
        :param method_name: the training method that will be used
        (options: {'lr': Logistic Regression, 'lr_cv': Logistic Regression with Cross-Validation,
        'mlm': Mixed Linear Model, 'rfc': Random Forest Classifier, 'gbc': Gradient Boosting Classifier,
        'dtc': Decision Tree Classifier, 'knc': K-Nearest Neighbors Classifier, 'nb': Multinomial Naive Bayes,
        'nn': Multi-Layer Perceptron (MLP) Neural Network}).
        :param model_labels: the feature names to be inputted into the model.
        Note: the order of features will be preserved internally.
        :param model_train: the training model.
        :param model_predict: the prediction outputs.
        :param model_cross_validate: the cross-validation model.
        """
        self.__logger.debug("Initialise the training method.")
        if method_name == "lr":
            self.__method = _LogisticRegression()
        elif method_name == "lr_cv":
            self.__method = _LogisticRegressionCV()
        elif method_name == "mlm":
            self.__method = _MixedLinearModel()
        elif method_name == "rfc":
            self.__method = _RandomForestClassifier()
        elif method_name == "gbc":
            self.__method = _GradientBoostingClassifier()
        elif method_name == "dtc":
            self.__method = _DecisionTreeClassifier()
        elif method_name == "knc":
            self.__method = _KNeighborsClassifier()
        elif method_name == "nb":
            self.__method = _NaiveBayes()
        elif method_name == "nn":
            self.__method = _NeuralNetwork()
        else:
            self.__logger.error(__name__ + " - Invalid training method: " +
                                str(method_name))
            sys.exit()

        self.model_labels = model_labels
        self.model_train = model_train
        self.model_predict = dict() if model_predict is None else model_predict
        self.model_cross_validate = model_cross_validate

    def train(self, features_indep_df: PandasDataFrame, feature_target: List,
              **kwargs: Any) -> Any:
        """Perform the training, using the selected method.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param kwargs: the training method's argument.
        :return: the trained model.
        """
        self.__logger.debug("Train.")
        self.model_labels = list(features_indep_df.columns.values)
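        # Note: model_labels fixes the feature order here; predict() later selects
        # features_indep_df[self.model_labels], so prediction inputs are realigned
        # to the same columns and order used for training.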
        self.model_train = self.__method.train(
            features_indep_df[self.model_labels], feature_target,
            self.model_labels, **kwargs)
        return self.model_train

    def plot(self) -> Any:
        """Plot the tree diagram.
        :return: the model graph.
        """
        self.__logger.debug("Plot.")
        return self.__method.plot(self.model_train, self.model_labels,
                                  ["True", "False"])

    def train_summaries(self) -> Any:
        """ Produce the training summary.
        :return: the training summary.
        """
        self.__logger.debug("Summarise training model.")
        return self.__method.train_summaries(self.model_train)

    def predict(self, features_indep_df: PandasDataFrame,
                sample_name: str) -> PandasDataFrame:
        """Predict probability of labels, using the training model.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param sample_name: the sample to predict (e.g. 'train', 'test', 'validate').
        :return: the predicted probabilities, and the predicted labels.
        """
        self.__logger.debug("Predict.")
        self.model_predict[sample_name] = self.__method.predict(
            self.model_train, features_indep_df[self.model_labels])
        return self.model_predict[sample_name]

    def predict_summaries(self, feature_target: List,
                          sample_name: str) -> CollectionsOrderedDict:
        """roduce summary statistics for the prediction performance.
        :param feature_target: the target feature, which is being estimated.
        :param sample_name: the sample to predict(e.g. 'train', 'test', 'validate').
        :return: the prediction summaries.
        """
        self.__logger.debug("Summarise predictions.")
        self.model_predict[sample_name]['target'] = feature_target
        return self.__method.predict_summaries(self.model_predict[sample_name],
                                               feature_target)

    def predict_summaries_risk_bands(
        self,
        feature_target: List,
        sample_name: str,
        cutoffs: List = np.arange(0, 1.05, 0.05)
    ) -> CollectionsOrderedDict:
        """Produce a summary statistics table for a range of cut-off points.
        :param feature_target: the target feature, which is being estimated.
        :param sample_name: the sample to predict (e.g. 'train', 'test', 'validate').
        :param cutoffs: a list of risk cut-off points.
        :return: the summary statistics table for the cut-off points.
        """
        self.__logger.debug("Summarise predictions.")
        self.model_predict[sample_name]['target'] = feature_target
        return self.__method.predict_summaries_cutoffs_table(
            self.model_predict[sample_name]['score'], feature_target, cutoffs)

    def cross_validate(self,
                       features_indep_df: PandasDataFrame,
                       feature_target: List,
                       scoring: str = "neg_mean_squared_error",
                       cv: int = 10) -> Any:
        """Evaluate the model by performing cross-validation.
        :param features_indep_df: the independent features, which are inputted into the model.
        :param feature_target: the target feature, which is being estimated.
        :param scoring: the scoring method (default: 'neg_mean_squared_error').
        :param cv: the cross-validation splitting strategy (optional).
        :return: the cross-validation summary.
        """
        self.__logger.info("Cross-Validate")

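        # Note: cross_val_score returns one score per fold (an array of length cv);
        # with scoring="neg_mean_squared_error" the scores are negated MSE values,
        # so values closer to zero indicate a better fit.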
        self.model_cross_validate = cross_val_score(
            self.model_train,
            features_indep_df[self.model_labels],
            feature_target,
            scoring=scoring,
            cv=cv)
        return self.model_cross_validate

    def cross_validate_summaries(self) -> Any:
        """Produce a summary of the applied cross-validation
        :return: the cross-validation summary
        """
        return self.model_cross_validate

    def save_model(self, path: str, title: str):
        """Save (pickle) the training model, as well as predictions and cross-validations.
        Note: summary statistics won't be saved.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.info("Saving model")
        objects = dict()
        objects['method_name'] = self.method_name
        objects['model_labels'] = self.model_labels
        objects['model_train'] = self.model_train
        objects['model_predict'] = self.model_predict
        objects['model_cross_validate'] = self.model_cross_validate
        self.__readers_writers.save_serialised(path, title, objects=objects)

    def save_model_compressed(self, path: str, title: str):
        """Save (pickle) & compressthe training model, as well as predictions and cross-validations.
        Note: summaries statistics won't not saved.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application.
        """
        self.__logger.debug("Save model.")
        objects = dict()
        objects['method_name'] = self.method_name
        objects['model_labels'] = self.model_labels
        objects['model_train'] = self.model_train
        objects['model_predict'] = self.model_predict
        objects['model_cross_validate'] = self.model_cross_validate
        self.__readers_writers.save_serialised_compressed(path,
                                                          title,
                                                          objects=objects)

    def load(self, path: str, title: str):
        """Load (unpickle) the model, which was saved using this application.
        :param path: the directory path of the saved trained model file, using this application (if applicable).
        :param title: the file name of the saved trained model file, using this application
        """
        self.__logger.debug("Load model.")
        objects = self.__readers_writers.load_serialised(path, title)
        try:
            self.__init__method(
                method_name=objects['method_name'],
                model_labels=objects['model_labels'],
                model_train=objects['model_train'],
                model_predict=objects['model_predict'],
                model_cross_validate=objects['model_cross_validate'])
        except KeyError:
            self.__logger.error(__name__ +
                                " - Invalid field(s) in the model file: " +
                                path)
            sys.exit()
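
# A minimal, illustrative usage sketch of the TrainingMethod class above; the
# feature DataFrames, target lists and file/path names below are placeholders,
# and CONSTANTS is assumed to have been configured already.
training = TrainingMethod("rfc")                       # Random Forest Classifier
training.train(features_train_df, target_train)        # fit on the training sample
training.predict(features_test_df, "test")             # store predictions under 'test'
summaries = training.predict_summaries(target_test, "test")
training.save_model_compressed("models", "rfc_v1")

# Reload the saved model later without retraining
reloaded = TrainingMethod(None, path="models", title="rfc_v1")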
Example #9
0
if not os.path.exists(io_path):
    os.makedirs(io_path, exist_ok=True)

logger = Logger(path=io_path, app_name=app_name, ext="log")
logger = logging.getLogger(app_name)

# Initialise constants and some of classes

# In[ ]:

# Initialise constants
CONSTANTS.set(io_path, app_name)

# In[ ]:

# Initialise other classes
readers_writers = ReadersWriters()
preprocess = PreProcess(io_path)

# In[ ]:

# Set print settings
pd.set_option('display.width', 1600, 'display.max_colwidth', 800)
pp = pprint.PrettyPrinter(indent=4)

# ### 1.2.  Initialise Features Metadata

# Read the input features' configuration file & store the features metadata

# In[ ]:

# variables settings