Example #1
import logging
import sys
from typing import Dict, List

import pandas as pd

# Project-specific helpers (CONSTANTS, ReadersWriters, FeatureParser) and the
# PandasDataFrame type alias are assumed to be importable from the surrounding package.


class Variables:
    def __init__(self, model_features_table: str, input_path: str,
                 output_path: str, input_features_configs: str,
                 output_table: str):
        """Initialise the objects and constants.
        :param model_features_table: the feature table name.
        :param input_path: the input path.
        :param output_path: the output path.
        :param input_features_configs: the input features' configuration file.
        :param output_table: the output table name.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__model_features_table = model_features_table
        self.__output_path = output_path
        self.__output_table = output_table
        self.__readers_writers = ReadersWriters()
        # initialise settings
        self.__variables_settings = self.__init_settings(
            input_path, input_features_configs)
        self.__features_dic_names = self.__init_features_names()
        self.__features_dic_dtypes = self.__init_features_dtypes()
        self.__init_output(output_path, output_table)

    def set(self, input_schemas: List, input_tables: List,
            history_tables: List, column_index: str, query_batch_size: int):
        """Set the variables by reading the selected features from MySQL database.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_size: the number of rows to be read in each batch.
        :return:
        """
        self.__logger.debug(__name__)
        query_batch_start, query_batch_max = self.__init_batch(
            input_schemas[0], input_tables[0])
        features_names, features_dtypes = self.__set_features_names_types()
        self.__validate_mysql_names(input_schemas, input_tables)
        prevalence = self.__init_prevalence(input_schemas, input_tables,
                                            history_tables)
        self.__set_batch(features_names, features_dtypes, input_schemas,
                         input_tables, history_tables, column_index,
                         prevalence, query_batch_start, query_batch_max,
                         query_batch_size)

    def __init_settings(self, input_path: str,
                        input_features_configs: str) -> PandasDataFrame:
        """Read and set the settings of input variables that are selected.
        :param input_path: the path of the input file.
        :param input_features_configs: the input features' configuration file.
        :return: the input variables settings.
        """
        self.__logger.debug(__name__)
        variables_settings = self.__readers_writers.load_csv(
            input_path, input_features_configs, 0, True)
        variables_settings = variables_settings.loc[
            (variables_settings["Selected"] == 1)
            & (variables_settings["Table_Reference_Name"] ==
               self.__model_features_table)]
        variables_settings = variables_settings.reset_index()
        return variables_settings

    def __init_features_names(self) -> Dict:
        """Generate the features names, based on variable name, source table alias name (a.k.a. history table
            name), and the aggregation function name.
        :return: the name of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_names = dict(
            zip(table_history_names,
                [[] for _ in range(len(table_history_names))]))
        for _, row in self.__variables_settings.iterrows():
            if not pd.isnull(row["Variable_Aggregation"]):
                postfixes = row["Variable_Aggregation"].replace(' ',
                                                                '').split(',')
                for postfix in postfixes:
                    features_names[row["Table_History_Name"]].append(
                        row["Variable_Name"] + "_" + postfix)
            else:
                features_names[row["Table_History_Name"]].append(
                    row["Variable_Name"])
        return features_names

    def __init_features_dtypes(self) -> Dict:
        """Generate the features types, based on the input configuration file.
        :return: the dtypes of features.
        """
        self.__logger.debug(__name__)
        table_history_names = set(
            self.__variables_settings["Table_History_Name"])
        features_dtypes = dict(
            zip(table_history_names,
                [[] for _ in range(len(table_history_names))]))
        for _, row in self.__variables_settings.iterrows():
            feature_types = row["Variable_dType"].replace(' ', '').split(',')
            for feature_type in feature_types:
                features_dtypes[row["Table_History_Name"]].append(feature_type)
        return features_dtypes

    def __init_output(self, output_path: str, output_table: str):
        """Initialise the output file by writing the header row.
        :param output_path: the output path.
        :param output_table: the output table name.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        self.__readers_writers.reset_csv(output_path, output_table)
        self.__readers_writers.save_csv(output_path,
                                        output_table,
                                        features_names,
                                        append=False)

    def __init_prevalence(self, input_schemas: List, input_tables: List,
                          history_tables: List) -> Dict:
        """Generate the prevalence dictionary of values for all the variables.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :return: the prevalence dictionary of values for all the variables.
        """
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table,
            ["Feature Name", "Top Prevalence Feature Name"],
            append=False,
            ext="ini")
        self.__readers_writers.save_text(
            self.__output_path,
            self.__output_table, ["Feature Name", "Prevalence & Freq."],
            append=False,
            ext="txt")
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        prevalence = dict()

        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                history_tables[table_i]]
            prevalence[input_tables[table_i]] = dict()

            # for features
            for _, row in variables_settings.iterrows():
                self.__logger.info("Prevalence: " + row["Variable_Name"] +
                                   " ...")
                if not pd.isnull(row["Variable_Aggregation"]):
                    # read features
                    variables = self.__init_prevalence_read(
                        input_schemas[table_i], input_tables[table_i],
                        row["Variable_Name"])

                    # validate
                    if variables is None or len(variables) == 0:
                        continue

                    # prevalence
                    prevalence[input_tables[table_i]][row["Variable_Name"]] = \
                        feature_parser.prevalence(variables[row["Variable_Name"]], row["Variable_Name"])

                    # for sub features
                    postfixes = row["Variable_Aggregation"].replace(
                        ' ', '').split(',')
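                    # postfixes such as "prevalence_1" refer to the N-th most prevalent
                    # value of the variable; each is mapped to the matching entry in the
                    # prevalence list below.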
                    for p in range(len(postfixes)):
                        feature_name = row["Variable_Name"] + "_" + postfixes[p]
                        if len(postfixes[p]
                               ) > 11 and postfixes[p][0:11] == "prevalence_":
                            index = int(postfixes[p].split('_')[1]) - 1
                            feature_name_prevalence = "None"
                            if index < len(prevalence[input_tables[table_i]][
                                    row["Variable_Name"]]):
                                feature_name_prevalence = \
                                    feature_name + "_" + \
                                    str(prevalence[input_tables[table_i]][row["Variable_Name"]][index])
                            # save prevalence
                            self.__readers_writers.save_text(
                                self.__output_path,
                                self.__output_table,
                                [feature_name, feature_name_prevalence],
                                append=True,
                                ext="ini")
        return prevalence

    def __init_prevalence_read(self, input_schema: str, input_table: str,
                               variable_name: str) -> PandasDataFrame:
        """Read a variable from database, to calculate the prevalence of the values.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param variable_name: the variable name.
        :return: the selected variable.
        """
        query = "SELECT `" + variable_name + "` FROM `" + input_table + "`;"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __init_batch(self, input_schema: str, input_table: str) -> [int, int]:
        """Find the minimum and maximum value of the index column, to use when reading mysql tables in
            batches.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :return: the minimum and maximum of the index column.
        """
        self.__logger.debug(__name__)
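        # the batch range is derived from the hard-coded `localID` column here; the
        # batched SELECTs in __set_batch_read use the `column_index` argument instead.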
        query = "select min(localID), max(localID) from `" + input_table + "`;"
        output = list(
            self.__readers_writers.load_mysql_query(query,
                                                    input_schema,
                                                    dataframing=False))
        if [r[0] for r in output][0] is None:
            self.__logger.error(__name__ + " No data is found: " + query)
            sys.exit()

        query_batch_start = int([r[0] for r in output][0])
        query_batch_max = int([r[1] for r in output][0])
        return query_batch_start, query_batch_max

    def __set_features_names_types(self):
        """Produce the sorted lists of features names and features dtypes.
        :return: the sorted lists of features names and features dtypes.
        """
        self.__logger.debug(__name__)
        keys = sorted(self.__features_dic_names.keys())
        features_names = [
            f for k in keys for f in self.__features_dic_names[k]
        ]
        features_dtypes = [
            pd.Series(dtype=f) for k in keys
            for f in self.__features_dic_dtypes[k]
        ]
        features_dtypes = pd.DataFrame(
            dict(zip(features_names, features_dtypes))).dtypes
        return features_names, features_dtypes

    def __set_batch(self, features_names: list, features_dtypes: Dict,
                    input_schemas: List, input_tables: List,
                    history_tables: List, column_index: str, prevalence: Dict,
                    query_batch_start: int, query_batch_max: int,
                    query_batch_size: int):
        """Using batch processing first read variables, then generate features and write them into output.
        :param features_names: the name of features that are selected.
        :param features_dtypes: the dtypes of features that are selected.
        :param input_schemas: the mysql database schemas.
        :param input_tables: the mysql table names.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param column_index: the name of the index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        """
        self.__logger.debug(__name__)
        feature_parser = FeatureParser(self.__variables_settings,
                                       self.__output_path, self.__output_table)
        step = -1
        batch_break = False

        while not batch_break:
            step += 1
            features = None
            for table_i in range(len(input_schemas)):
                self.__logger.info("Batch: " + str(step) + "; Table: " +
                                   input_tables[table_i])

                # read job
                variables = self.__set_batch_read(input_schemas[table_i],
                                                  input_tables[table_i], step,
                                                  column_index,
                                                  query_batch_start,
                                                  query_batch_max,
                                                  query_batch_size)

                # validate
                if variables is None:
                    batch_break = True
                    break
                elif len(variables) == 0:
                    continue

                # process job
                if features is None:
                    features = pd.DataFrame(0,
                                            index=range(len(variables)),
                                            columns=features_names)
                    features = features.astype(dtype=features_dtypes)
                features = self.__set_batch_process(
                    feature_parser, history_tables[table_i], features,
                    variables, prevalence[input_tables[table_i]])

            # write job
            if features is not None:
                features = features.astype(dtype=features_dtypes)
                self.__set_batch_write(features)

    def __set_batch_read(
            self, input_schema: str, input_table: str, step: int,
            column_index: str, query_batch_start: int, query_batch_max: int,
            query_batch_size: int) -> PandasDataFrame:
        """Read the queried variables.
        :param input_schema: the mysql database schema.
        :param input_table: the mysql database table.
        :param step: the batch id.
        :param column_index: the name of the index column (unique integer value) in the database table, which is used
            for batch reading the input.
        :param query_batch_start: the minimum value of the column index.
        :param query_batch_max: the maximum value of the column index.
        :param query_batch_size: the number of rows to be read in each batch.
        :return: the queried variables.
        """
        step_start = query_batch_start + step * query_batch_size
        step_end = step_start + query_batch_size
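        # each batch covers the half-open index window [step_start, step_end)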
        if step_start >= query_batch_max:
            return None
        # read
        query = "SELECT * FROM `" + input_table + \
                "` WHERE `" + str(column_index) + "` >= " + str(step_start) + \
                " AND `" + str(column_index) + "` < " + str(step_end) + ";"
        return self.__readers_writers.load_mysql_query(query,
                                                       input_schema,
                                                       dataframing=True)

    def __set_batch_process(self, feature_parser: FeaturesFeatureParser,
                            history_table: str, features: PandasDataFrame,
                            variables: PandasDataFrame,
                            prevalence: List) -> PandasDataFrame:
        """Process variables and generate features.
        :param feature_parser: the feature parser object used to generate the features.
        :param history_table: the source table alias name (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        :param features: the output features.
        :param variables: the input variables.
        :param prevalence: the prevalence dictionary of values for all the variables.
        :return: the generated features.
        """
        return feature_parser.generate(history_table, features, variables,
                                       prevalence)

    def __set_batch_write(self, features: PandasDataFrame):
        """Write the features into an output file.
        :param features: the output features.
        """
        self.__readers_writers.save_csv(self.__output_path,
                                        self.__output_table,
                                        features,
                                        append=True)

    def __validate_mysql_names(self, input_schemas: List,
                               history_tables: List):
        """Validate mysql tables and their columns, and generate exception if table/column name is invalid.
        :param input_schemas: the mysql database schemas.
        :param history_tables: the source tables' alias names (a.k.a. history table name) that features belong to
            (e.g. inpatient, or outpatient).
        """
        # for tables
        for table_i in range(len(input_schemas)):
            variables_settings = self.__variables_settings[
                self.__variables_settings["Table_History_Name"] ==
                history_tables[table_i]]
            # validate table name
            if not self.__readers_writers.exists_mysql(
                    input_schemas[table_i], history_tables[table_i]):
                self.__logger.error(__name__ + " - Table does not exist: " +
                                    history_tables[table_i])
                sys.exit()

            # for features
            for _, row in variables_settings.iterrows():
                # validate column name
                if not self.__readers_writers.exists_mysql_column(
                        input_schemas[table_i], history_tables[table_i],
                        row["Variable_Name"]):
                    self.__logger.error(__name__ +
                                        " - Column does not exist: " +
                                        row["Variable_Name"])
                    sys.exit()
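
# A minimal usage sketch of the class above. The paths, schema, table and configuration
# file names below are hypothetical, and a reachable MySQL instance with the configured
# feature tables is assumed.
variables = Variables(model_features_table="features_model",
                      input_path="path/to/inputs",
                      output_path="path/to/outputs",
                      input_features_configs="features_config.csv",
                      output_table="features_output")
variables.set(input_schemas=["schema_a"],
              input_tables=["table_a"],
              history_tables=["inpatient"],
              column_index="localID",
              query_batch_size=100000)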
Example #2
import logging
import multiprocessing as mp
import os
import sys
from collections import OrderedDict
from functools import partial
from typing import Any, Dict, List

import numpy as np
import pandas as pd
from scipy import stats
from sklearn import feature_selection

# Project-specific helpers (CONSTANTS, ReadersWriters, FactoringThread, TransformThread,
# PyConfigParser) and the PandasDataFrame / CollectionsOrderedDict type aliases are
# assumed to be importable from the surrounding package.


class PreProcess:
    def __init__(self, output_path: str):
        """Initialise the objects and constants.
        :param output_path: the output path.
        """
        self.__logger = logging.getLogger(CONSTANTS.app_name)
        self.__logger.debug(__name__)
        self.__output_path = output_path
        self.__readers_writers = ReadersWriters()

    def stats_discrete_df(self, df: PandasDataFrame, includes: List,
                          file_name: str) -> PandasDataFrame:
        """Calculate the odds ratio for all the features that are included and all the categorical states.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for discrete features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path,
                                        title=file_name,
                                        data=[],
                                        append=False)

        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=["Feature Name", f_name],
                                                append=True)
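                # scipy.stats.itemfreq returns (value, count) pairs; it has been removed
                # in newer SciPy releases, where np.unique(..., return_counts=True) is
                # the equivalent.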
                summaries = stats.itemfreq(df[f_name])
                summaries = pd.DataFrame({
                    "value": summaries[:, 0],
                    "freq": summaries[:, 1]
                })
                summaries = summaries.sort_values("freq", ascending=False)
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=summaries,
                                                append=True,
                                                header=True)
        return summaries

    def stats_continuous_df(self, df: PandasDataFrame, includes: List,
                            file_name: str) -> PandasDataFrame:
        """Calculate the descriptive statistics for all the included continuous features.
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param file_name: the name of the summary output file.
        :return: the summary output.
        """
        self.__logger.debug("Produce statistics for continuous features.")
        summaries = None
        self.__readers_writers.save_csv(path=self.__output_path,
                                        title=file_name,
                                        data=[],
                                        append=False)

        for f_name in includes:
            if f_name in df:
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=["Feature Name", f_name],
                                                append=True)
                summaries = df[f_name].apply(pd.to_numeric).describe(
                    percentiles=[0.05, 0.25, 0.5, 0.75, 0.95]).transpose()
                summaries = pd.Series.to_frame(summaries).transpose()
                self.__readers_writers.save_csv(path=self.__output_path,
                                                title=file_name,
                                                data=summaries,
                                                append=True,
                                                header=True)
        return summaries

    def factoring_group_wise(self,
                             df: PandasDataFrame,
                             categories_dic: Dict,
                             labels_dic: Dict,
                             dtypes_dic: Dict,
                             threaded: bool = False) -> PandasDataFrame:
        """Categorise groups of features that are selected.
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :param dtypes_dic: the dictionary of the dtypes of the categorised features.
        :param threaded: indicates if it is multi-threaded.
        :return: the inputted dataframe with categorised features (if applicable).
        """
        self.__logger.debug("Categorise groups of features.")
        categories_dic = OrderedDict(categories_dic)

        if threaded is not True:
            pool_df_encoded = self.__factoring_group_wise_series(
                df, categories_dic, labels_dic)
        else:
            pool_df_encoded = self.__factoring_group_wise_threaded(
                df, categories_dic, labels_dic)

        # encoded labels
        labels_encoded = []
        for label_group in categories_dic.keys():
            labels_encoded += list(categories_dic[label_group].keys())

        # preserve types
        dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic}
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        for label in labels_encoded:
            del dtype_orig[label]

        # combine
        df = df.drop(labels_encoded, axis=1)
        df = pd.concat([df] + pool_df_encoded, axis=1)
        df = df.astype(dtype_orig)
        return df

    def __factoring_group_wise_series(self, df: PandasDataFrame,
                                      categories_dic: Dict,
                                      labels_dic: Dict) -> List:
        """Categorise a group of features that are selected (single-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise groups of features (single-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        pool_df_encoded = []

        try:
            for label_group in categories_dic.keys():
                pool_df_encoded.append(
                    factoring_thread.factor_arr_multiple(label_group))
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def __factoring_group_wise_threaded(self, df: PandasDataFrame,
                                        categories_dic: Dict,
                                        labels_dic: Dict) -> List:
        """Categorise a group of features that are selected (multi-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise groups of features (multi-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        try:
            with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
                pool_df_encoded = pool.map(
                    partial(factoring_thread.factor_arr_multiple),
                    categories_dic.keys())
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def factoring_feature_wise(self,
                               df: PandasDataFrame,
                               categories_dic: Dict,
                               labels_dic: Dict,
                               dtypes_dic: Dict,
                               threaded: bool = False) -> PandasDataFrame:
        """Categorise features that are selected.
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :param dtypes_dic: the dictionary of the dtypes of the categorised features.
        :param threaded: indicates if it is multi-threaded.
        :return: the inputted dataframe with categorised features (if applicable).
        """
        self.__logger.debug("Categorise.")
        categories_dic = OrderedDict(categories_dic)

        if threaded is not True:
            pool_df_encoded = self.__factoring_feature_wise_series(
                df, categories_dic, labels_dic)
        else:
            pool_df_encoded = self.__factoring_feature_wise_threaded(
                df, categories_dic, labels_dic)

        # encoded labels
        labels_encoded = list(categories_dic.keys())

        # preserve types
        dtype_orig = {**df.dtypes.to_dict(), **dtypes_dic}
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        for label in labels_encoded:
            del dtype_orig[label]

        # combine
        df = df.drop(labels_encoded, axis=1)
        df = pd.concat([df] + pool_df_encoded, axis=1)
        df = df.astype(dtype_orig)
        return df

    def __factoring_feature_wise_series(self, df: PandasDataFrame,
                                        categories_dic: Dict,
                                        labels_dic: Dict) -> List:
        """Categorise features that are selected (single-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise (single-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        pool_df_encoded = []

        try:
            for label_group in categories_dic.keys():
                pool_df_encoded.append(
                    factoring_thread.factor_arr(label_group))
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def __factoring_feature_wise_threaded(self, df: PandasDataFrame,
                                          categories_dic: Dict,
                                          labels_dic: Dict) -> List:
        """Categorise features that are selected (multi-threaded).
        :param df: the features dataframe.
        :param categories_dic: the dictionary of the categorical states for the included features.
        :param labels_dic: the dictionary of the features names of the categorised features.
        :return: the categorised features.
        """
        self.__logger.debug("Categorise (multi-threaded).")
        factoring_thread = FactoringThread(df, categories_dic, labels_dic)
        try:
            with mp.Pool() as pool:
                pool_df_encoded = pool.map(
                    partial(factoring_thread.factor_arr),
                    categories_dic.keys())
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()
        return pool_df_encoded

    def transform_df(self,
                     df: PandasDataFrame,
                     excludes: List,
                     transform_type: str,
                     threaded: bool = False,
                     method_args: Dict = None,
                     **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param threaded: indicates if it is multi-threaded.
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the inputted dataframe with transformed features (if applicable).
        """
        self.__logger.info("Transform Features.")
        excludes = set(excludes)
        includes = [
            label for label in df.columns.values if label not in excludes
        ]
        method_args = dict() if method_args is None else method_args

        # preserve types
        dtype_orig = df.dtypes.to_dict()
        for label in includes:
            dtype_orig[label] = 'f8'
        dtype_orig = pd.DataFrame(dtype_orig, index=[0]).dtypes
        df = df.astype(dtype_orig)

        # transform
        if threaded is False:
            df, method_args = self.__transform_df_series(
                df, includes, transform_type, **kwargs)
        else:
            df, method_args = self.__transform_df_threaded(
                df, includes, transform_type, method_args, **kwargs)
        return df, method_args

    def __transform_df_series(self,
                              df: PandasDataFrame,
                              includes: List,
                              transform_type: str,
                              method_args: Dict = None,
                              **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method (single-threaded).
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the transformed feature.
        """
        self.__logger.debug("Transform features (single-threaded).")
        transform_thread = TransformThread(**kwargs)
        method_args = dict() if method_args is None else method_args

        try:
            if transform_type == "scale":
                for name in includes:
                    transform_thread.transform_scale_arr(df, method_args, name)
            elif transform_type == "robust_scale":
                for name in includes:
                    transform_thread.transform_robust_scale_arr(
                        df, method_args, name)
            elif transform_type == "max_abs_scalar":
                for name in includes:
                    transform_thread.transform_max_abs_scalar_arr(
                        df, method_args, name)
            elif transform_type == "normalizer":
                for name in includes:
                    transform_thread.transform_normalizer_arr(
                        df, method_args, name)
            elif transform_type == "kernel_centerer":
                for name in includes:
                    transform_thread.transform_kernel_centerer_arr(
                        df, method_args, name)
            elif transform_type == "yeo_johnson":
                for name in includes:
                    transform_thread.transform_yeo_johnson_arr(
                        df, method_args, name)
            elif transform_type == "box_cox":
                for name in includes:
                    transform_thread.transform_box_cox_arr(
                        df, method_args, name)
            else:
                raise Exception(transform_type)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        return df, method_args

    def __transform_df_threaded(self,
                                df: PandasDataFrame,
                                includes: List,
                                transform_type: str,
                                method_args: Dict = None,
                                **kwargs: Any) -> [PandasDataFrame, Dict]:
        """Transform the included features, using the selected and configured method (multi-threaded).
        :param df: the features dataframe.
        :param includes: the name of included features.
        :param transform_type: the transformation type (options: 'scale', 'robust_scale', 'max_abs_scalar',
        'normalizer', 'kernel_centerer', 'yeo_johnson', 'box_cox')
        :param method_args: the transformation arguments, which need to be preserved if the transformation is
        applied to more than one data set.
        :param kwargs: the input argument for the selected transformation function.
        :return: the transformed feature.
        """
        self.__logger.debug("Transform features (multi-threaded).")
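        # share each included column with the worker processes through a Manager dict
        # (column name -> list of values); the transformed values are copied back into
        # the dataframe once the pool has finished.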
        manager = mp.Manager()
        dt = manager.dict(
            list(zip(df[includes].columns, df[includes].T.values.tolist())))
        transform_thread = TransformThread(**kwargs)
        method_args = dict() if method_args is None else method_args

        # run
        try:
            with mp.Pool(processes=(mp.cpu_count() - 1)) as pool:
                if transform_type == "scale":
                    pool.map(
                        partial(transform_thread.transform_scale_arr, dt,
                                method_args), includes)
                elif transform_type == "robust_scale":
                    pool.map(
                        partial(transform_thread.transform_robust_scale_arr,
                                dt, method_args), includes)
                elif transform_type == "max_abs_scalar":
                    pool.map(
                        partial(transform_thread.transform_max_abs_scalar_arr,
                                dt, method_args), includes)
                elif transform_type == "normalizer":
                    pool.map(
                        partial(transform_thread.transform_normalizer_arr, dt,
                                method_args), includes)
                elif transform_type == "kernel_centerer":
                    pool.map(
                        partial(transform_thread.transform_kernel_centerer_arr,
                                dt, method_args), includes)
                elif transform_type == "yeo_johnson":
                    pool.map(
                        partial(transform_thread.transform_yeo_johnson_arr, dt,
                                method_args), includes)
                elif transform_type == "box_cox":
                    pool.map(
                        partial(transform_thread.transform_box_cox_arr, dt,
                                method_args), includes)
                else:
                    raise Exception(transform_type)
        except ValueError as exception:
            self.__logger.error(__name__ + " - Invalid configuration(s): " +
                                str(exception))
            sys.exit()

        # set
        for k, v in dt.items():
            df[k] = v

        return df, method_args

    def high_linear_correlation_df(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_corr_cut: float = 0.95,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected highly linearly correlated features.
        The Pearson correlation coefficient is calculated for all pairs of variables to measure the linear dependence
        between them.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_corr_cut: the numeric value for the pair-wise absolute correlation cutoff. e.g. 0.95.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with high linear correlation (if applicable).")
        corr = None
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        summaries = OrderedDict()

        # search
        if to_search is True:
            corr = df[[col for col in df.columns
                       if col not in excludes]].corr(method='pearson')
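            # for every feature, collect the other features whose absolute Pearson
            # correlation with it reaches the cutoff.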
            for label in corr.columns.values:
                matches_temp = list(
                    corr[abs(corr[label]) >= thresh_corr_cut].index)
                if len(matches_temp) > 1:
                    # set matches
                    try:
                        matches_temp.remove(label)
                    except (ValueError, AttributeError):
                        pass  # not in some-list! OR not behaving like a list!
                    matches = np.union1d(matches, matches_temp)

                    # summaries
                    for match in list(matches_temp):
                        if match in summaries.keys():
                            matches_temp.remove(match)
                    if len(matches_temp) > 0:
                        summaries[label] = matches_temp
                        self.__logger.info("High Linear Correlation: " +
                                           label + " ~ " + str(matches_temp))

        # delete
        df = self.__remove(
            df, summaries, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
            summaries["Correlation Matrix"] = corr
        return df, summaries

    def near_zero_var_df_sklearn(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_variance: float = 0.05,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (Scikit algorithm).
        Feature selector that removes all low-variance features.
        This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be
        used for unsupervised learning.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_variance: Features with a training-set variance lower than this threshold will be removed.
        The default is to keep all features with non-zero variance, i.e. remove the features that have the same
        value in all samples.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with near-zero-variance (if applicable), using Scikit algorithm."
        )
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        indices = OrderedDict()
        summaries = OrderedDict()

        # find indices
        for label in df.columns.values:
            indices[df.columns.get_loc(label)] = label

        # search
        if to_search is True:
            variances_ = feature_selection.VarianceThreshold(thresh_variance)
            # fit the selector before querying its support (assumes numeric columns)
            variances_.fit(df)
            # columns outside the support fall below the variance threshold and are
            # flagged for removal
            support_indices = set(variances_.get_support(indices=True))
            matches_indices = [index for index in indices
                               if index not in support_indices]
            matches_labels = [indices[index] for index in matches_indices]
            for match in matches_labels:
                if match not in excludes:
                    matches += [match]

        # delete
        df = self.__remove(
            df, {'NZV': list(matches)}, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries

    def near_zero_var_df(
            self,
            df: PandasDataFrame,
            excludes: List,
            file_name: str,
            thresh_unique_cut: float = 100,
            thresh_freq_cut: float = 1000,
            to_search: bool = True
    ) -> [PandasDataFrame, CollectionsOrderedDict]:
        """Find and optionally remove the selected near-zero-variance features (custom algorithm).
        Features that are highly constant, or that have near-zero variance, may be filtered out.
        The rules are as follows:
         - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be
           greater than a threshold;
         - Percent of unique values: The number of unique values divided by the total number of samples to be greater
           than the threshold.
        :param df: the features dataframe.
        :param excludes: the name of excluded features.
        :param file_name: the name of the summary output file.
        :param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples
        (upper limit). e.g. 10 * 100 / 100.
        :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value
        (lower limit). e.g. 95/5.
        :param to_search: to search or use the saved configuration.
        :return: the inputted dataframe with exclusion of features that were selected to be removed.
        """
        self.__logger.debug(
            "Remove features with near-zero-variance (if applicable), using custom algorithm."
        )
        df_excludes = df[excludes]
        excludes = set(excludes)
        matches = []
        summaries = OrderedDict()

        # search
        if to_search is True:
            for label in df.columns.values:
                # set match and summaries
                # check of NaN
                if not isinstance(df[label].iloc[0], (int, np.integer, float, np.floating)) \
                        or np.isnan(np.sum(df[label])):
                    matches += [label]
                    continue
                # check of near zero variance
                match, summaries[label] = self.__near_zero_var(
                    df[label], label, excludes, thresh_unique_cut,
                    thresh_freq_cut)
                if match is True:
                    matches += [label]
                    self.__logger.info("Near Zero Variance: " + label)

        # to_remove
        df = self.__remove(
            df, {'NZV': list(matches)}, to_search,
            os.path.join(self.__output_path, file_name + ".ini"))
        for name in excludes:
            df[name] = df_excludes[name]
        if any(np.isnan(df.index)):
            df = df.reset_index(drop=True)

        # summaries
        if to_search is True:
            summaries["Features Matches"] = matches
        return df, summaries

    def __near_zero_var(self, arr: List, label: str, excludes: set,
                        thresh_unique_cut: float,
                        thresh_freq_cut: float) -> [bool, Dict]:
        """Assess a single feature for near-zero-variance (custom algorithm).
        Features that are highly constant, or that have near-zero variance, may be flagged for filtering.
        The rules are as follows:
         - Frequency ratio: The frequency of the most prevalent value over the second most frequent value to be
           greater than a threshold;
         - Percent of unique values: The number of unique values divided by the total number of samples to be greater
           than the threshold.

        :param arr: the feature value.
        :param label: the feature name.
        :param excludes: the name of excluded features.
        :param thresh_unique_cut: the cutoff for the percentage of distinct values out of the number of total samples
        (upper limit). e.g. 10 * 100 / 100.
        :param thresh_freq_cut: the cutoff for the ratio of the most common value to the second most common value
        (lower limit). e.g. 95/5.
        :return: indicates if the feature has near-zero-variance, together with a summary of its unique values
            and their counts.
        """
        self.__logger.debug(
            "Find near-zero-variance (if applicable), using custom algorithm.")
        unique, counts = np.unique(arr, return_counts=True)
        if len(counts) == 1:
            return True, {'unique': list(unique), 'counts': list(counts)}
        else:
            counts = sorted(counts, reverse=True)
            if label not in excludes and (len(unique) * 100) / float(
                    len(arr)) > thresh_unique_cut:
                return True, {'unique': list(unique), 'counts': list(counts)}
            if label not in excludes and counts[0] / float(
                    counts[1]) > thresh_freq_cut:
                return True, {'unique': list(unique), 'counts': list(counts)}
            else:
                return False, {'unique': list(unique), 'counts': list(counts)}

    def __remove(self,
                 df: PandasDataFrame,
                 dict_matches: Dict,
                 to_search: bool,
                 path: str,
                 section: str = "features") -> PandasDataFrame:
        """Confirm removals and if confirmed, then re-read the selected features, then remove
        :param df: the features dataframe.
        :param dict_matches: the matched features.
        :param to_search: to search or use the saved configuration.
        :param path: the file path to the configuration file.
        :param section: the section name in the configuration file.
        :return: the updated features.
        """
        self.__logger.debug("Confirm removals and implement removal process.")
        config = PyConfigParser(path, CONSTANTS.app_name)

        if to_search is True:
            # write to config
            config.reset()
            config.write_dict(dict_matches, section)
            # confirm
            response = self.__readers_writers.question_overwrite(
                "the features defined in the following file to be removed: " +
                path)
            if response is False:
                config.reset()
                return df

        # if to_search is False or response was yes then read from config
        config.refresh()
        dict_matches = config.read_dict(section)

        # remove
        self.__logger.debug("The feature removal list: " +
                            ",".join(dict_matches))
        labels = [
            label for label_group in dict_matches.values()
            for label in label_group if label in df
        ]
        if len(labels) > 0:
            df = df.drop(labels, axis=1)
        return df
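
# A minimal usage sketch of the class above. The dataframe, its column names and the
# output file names are hypothetical, and the near-zero-variance step prompts for
# confirmation before removing any features.
df = pd.DataFrame({"patient_id": [1, 2, 3, 4],
                   "age": [34, 51, 67, 49],
                   "gender": [0, 1, 1, 0]})
pre_process = PreProcess(output_path="path/to/outputs")
pre_process.stats_continuous_df(df, includes=["age"], file_name="stats_continuous")
pre_process.stats_discrete_df(df, includes=["gender"], file_name="stats_discrete")
df, method_args = pre_process.transform_df(df, excludes=["patient_id"],
                                           transform_type="scale")
df, summaries = pre_process.near_zero_var_df(df, excludes=["patient_id"],
                                             file_name="near_zero_variance")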
Example #3
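# Build a shell-command prefix that adds the bundled Infer.NET 2.6 binaries directory to
# PATH and then invokes the "Learner" command-line runner.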
cmdLearner = "set \"PATH=%PATH%;" + os.path.abspath(
    "Libraries/Infer.NET_2.6/Bin/") + "\" && Learner"

# ## 4.1. Prepare Inputs

# Write the selected features for submodels into CSV files, to be used for training and testing with Infer.NET.

# In[ ]:

for sample in input_files.keys():
    for submodel in input_files[sample].keys():
        readers_writers.save_csv(
            data=features_input[sample][submodel].drop(['hesid'], axis=1),
            path=io_path,
            title=input_files[sample][submodel] + "_InferDotNet",
            append=False,
            ext="csv",
            header=features_input[sample][submodel].drop(['hesid'],
                                                         axis=1).columns)

# Convert the CSV files to the correct format, following the Infer.NET <a href="http://infernet.azurewebsites.net/docs/Infer.NET%20Learners%20-%20Matchbox%20recommender%20-%20Command-line%20runners.aspx">guideline</a>.

# In[ ]:

for sample in input_files.keys():
    for submodel in input_files[sample].keys():
        # execute shell commands
        cmd1 = os.path.join(io_path,
                            input_files[sample][submodel] + "_InferDotNet.csv")
        cmd2 = os.path.abspath(os.path.join("ReadersWriters", "script_vim.txt"))
        get_ipython().system('vim $cmd1 -S $cmd2')