class XGBoostFeatureReduction:
    """
    A class that allows a number of features to be selected.
    This uses unsupervised learning in the form of a BayesianGaussianMixture
    and an unsupervised XGBClassifier.
    """

    bgmm = BayesianGaussianMixtureWrapper()

    def __init__(self):
        pass

    @staticmethod
    def get_feature_list(incoming_df, n_features=None):
        """
        Trains an XGBClassifier on the incoming data combined with generated
        samples that are marked as synthetic, so the classifier learns which
        features best separate real rows from synthetic ones.
        A list of feature indices is returned, sorted by importance.
        :param incoming_df: dataframe to select features from
        :param n_features: number of features to return (defaults to all)
        :return: list of feature indices sorted by importance
        """

        if n_features is None:
            # set to the number of columns in the df
            n_features = len(list(incoming_df))

        x, y = XGBoostFeatureReduction.bgmm.get_synthetic_training_data(
            incoming_df)

        # Create the XGBoost classifier
        clf = XGBClassifier(random_state=42,
                            verbosity=0)

        # Train the classifier
        clf.fit(x, y)

        # sort the feature indexes and return
        features = np.argsort(clf.feature_importances_)[::-1]

        return list(features[:n_features])
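A minimal usage sketch for the selector above (illustrative, not part of the original source); it assumes XGBoostFeatureReduction and its BayesianGaussianMixtureWrapper dependency are importable from the project:

import numpy as np
import pandas as pd

# build a small illustrative DataFrame; any observational dataset will do
rng = np.random.default_rng(42)
toy_df = pd.DataFrame(rng.normal(size=(200, 6)),
                      columns=["x{0}".format(i) for i in range(6)])

# column indices of the 3 most important features, best first
top_features = XGBoostFeatureReduction.get_feature_list(toy_df, n_features=3)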
class SGDClassifierFeatureReduction(object):
    """
    A class that allows a number of features to be selected.
    This uses unsupervised learning in the form of a BayesianGaussianMixture
    and an SGDClassifier.
    """

    bgmm = BayesianGaussianMixtureWrapper()

    def __init__(self):
        pass

    @staticmethod
    def get_feature_list(incoming_df, n_features=None):
        """
        Returns a reduced list of feature indices, ranked by the magnitude
        of the fitted SGDClassifier coefficients.
        :param incoming_df: dataframe to select features from
        :param n_features: number of features to return (defaults to all)
        :return: list of feature indices sorted by importance
        """

        # define the model
        model = SGDClassifier()

        # get the synthetic training data
        x, y = SGDClassifierFeatureReduction.bgmm.get_synthetic_training_data(
            incoming_df)

        # fit the model
        model.fit(x, y)

        # get importance from the fitted coefficients
        coefs = model.coef_

        # rank feature indexes by absolute coefficient value, descending
        features = np.argsort(np.abs(coefs).sum(axis=0))[::-1]

        if n_features is None:
            # set to the number of columns in the df
            n_features = len(list(incoming_df))

        return list(features[:n_features])
class RecursiveFeatureElimination:
    """
    Feature ranking with recursive feature elimination.
    """

    bgmm = BayesianGaussianMixtureWrapper()

    def __init__(self):
        pass

    @staticmethod
    def get_feature_list(incoming_df, n_features=None):
        """
        Uses recursive feature elimination with a GradientBoostingClassifier,
        trained on the incoming data combined with generated samples that are
        marked as synthetic.
        A list of the selected feature indices is returned.
        :param incoming_df: dataframe to select features from
        :param n_features: number of features to select (defaults to all)
        :return: list of selected feature indices
        """

        if n_features is None:
            # set to the number of columns in the df
            n_features = len(list(incoming_df))

        x, y = RecursiveFeatureElimination.bgmm.get_synthetic_training_data(incoming_df)

        # Create the RFE selector
        rfe = RFE(estimator=GradientBoostingClassifier(), n_features_to_select=n_features)

        # Train the classifier
        rfe.fit(x, y)

        # collect the indices of the columns RFE kept
        features = []

        for i in range(x.shape[1]):
            # support_ marks whether each column was selected
            if rfe.support_[i]:
                features.append(i)

        return features
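The selectors above all share the same trick: BayesianGaussianMixtureWrapper.get_synthetic_training_data turns an unlabelled dataframe into a supervised problem by combining the real rows with generated rows labelled as synthetic, so any classifier's importances can be used to rank the features. The wrapper's exact sampling is project-specific; the sketch below is an illustrative stand-in, not the project's implementation, and approximates the idea by permuting each column independently to build the synthetic half:

import numpy as np

def real_vs_synthetic_training_data(df, random_state=42):
    """Illustrative stand-in for get_synthetic_training_data: real rows are
    labelled 0 and column-wise permuted rows are labelled 1, so a classifier
    must learn the joint structure of the real data to separate them."""
    rng = np.random.default_rng(random_state)
    real = df.to_numpy()
    synthetic = np.column_stack([rng.permutation(real[:, j])
                                 for j in range(real.shape[1])])
    x = np.vstack([real, synthetic])
    y = np.concatenate([np.zeros(len(real)), np.ones(len(synthetic))])
    return x, y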
class App:
    """
    The main AitiaExplorer app entry point.
    """

    algo_runner = AlgorithmRunner()
    feature_selection = FeatureSelectionRunner()
    graph_metrics = GraphMetrics()
    graph_util = GraphUtil()
    data = TargetData()
    bgmm = BayesianGaussianMixtureWrapper()

    def __init__(self):
        self.vm_running = False

    def run_analysis_with_high_low(self,
                                   incoming_df,
                                   target_graph_str=None,
                                   feature_high=None,
                                   feature_low=None,
                                   feature_selection_list=None,
                                   algorithm_list=None,
                                   pc=None,
                                   verbose=True
                                   ):
        """
        Runs the entire analysis with feature selection and causal discovery between a high and low range
        of features. Returns the best results in a separate dataframe.

        :param incoming_df: dataframe
        :param target_graph_str: string in dot format
        :param feature_high: number of features range end
        :param feature_low: number of features range start
        :param feature_selection_list: list of feature selection algorithms
        :param algorithm_list: list of causal discovery algorithms
        :param pc: py-causal object for java vm communication
        :param verbose: verbose boolean
        :return: tuple:
            (AnalysisResults obj,
            best result dataframe,
            target graph (approximated or otherwise),
            all results dataframe)
        """
        if feature_high is None:
            # just default to number of features in dataframe
            feature_high = len(list(incoming_df))

        if feature_low is None:
            # just default to 1
            feature_low = 1

        if target_graph_str is None:
            # no target graph has been supplied, so let's create an approximation
            # using the hill climbing algorithm
            if verbose:
                print("No target graph has been supplied.")
                print("The system will generate an approximate target graph using the greedy hill climbing algorithm.")
            target_graph_str = self.algo_runner.algo_hill_climber(incoming_df)

        # to store all the results
        results_dict = dict()
        all_results_df = pd.DataFrame()

        # this is just so we can pass it back
        returned_target_graph = None

        for i in range(feature_high, feature_low, -1):

            if verbose:
                print("-----------------------------------------------")
                print("Starting analysis with {0} features...".format(i))

            # get current run results
            result_obj, results_df, returned_target_graph = self.run_analysis(
                incoming_df,
                target_graph_str=target_graph_str,
                n_features=i,
                feature_selection_list=feature_selection_list,
                algorithm_list=algorithm_list,
                pc=pc,
                verbose=verbose)

            results_dict[i] = (result_obj, results_df)
            all_results_df = pd.concat([all_results_df, results_df], ignore_index=True)

            if verbose:
                print("Completed analysis with {0} features...".format(i))

        # now we need to figure out the lowest SHD
        shd_results_dict = dict()

        # get the minimum shd for each run
        for k, v in results_dict.items():
            # dict holds a tuple, second value is df
            results_df = v[1]
            minimum_shd = results_df['SHD'].min()
            shd_results_dict[k] = minimum_shd

        # sort the shd
        shd_tuple_list = sorted(shd_results_dict.items(), key=lambda x: x[1])

        # first results are the best, first value in tuple is feature no / index
        i = shd_tuple_list[0][0]

        if verbose:
            print("All done!")
            print("The results with the lowest SHD have been returned.")

        # return the results from the results dict
        # --> results_obj, result_df, target_graph, all_results_df
        return results_dict[i][0], results_dict[i][1], returned_target_graph, all_results_df

    def run_analysis(self,
                     incoming_df,
                     target_graph_str=None,
                     n_features=None,
                     feature_selection_list=None,
                     algorithm_list=None,
                     pc=None,
                     verbose=True):
        """
        Runs the entire analysis with feature selection and causal discovery.
        Takes a specific number of features to return.

        :param incoming_df: dataframe
        :param target_graph_str: string in dot format
        :param n_features: number of features to select (defaults to all if None supplied)
        :param feature_selection_list: list of feature selection algorithms
        :param algorithm_list: list of causal discovery algorithms
        :param pc: py-causal object for java vm communication
        :param verbose: verbose boolean
        :return: tuple:
            (AnalysisResults obj,
            all results dataframe,
            target graph (approximated or otherwise))
        """

        if n_features is None:
            # just default to number of features in dataframe
            n_features = len(list(incoming_df))

        if target_graph_str is None:
            # no target graph has been supplied, so let's create an approximation
            # using the hill climbing algorithm
            if verbose:
                print("No target graph has been supplied.")
                print("The system will generate an approximate target graph using the greedy hill climbing algorithm.")
            target_graph_str = self.algo_runner.algo_hill_climber(incoming_df)

        feature_selection_list = self._get_feature_selection_algorithms(feature_selection_list)

        amalgamated_analysis_results = []

        for feature_selection in feature_selection_list:
            # get the actual function
            feature_func = feature_selection[1]

            # get the feature list from the function
            features = feature_func(incoming_df, n_features)

            if verbose:
                print("Running causal discovery on features selected by {0}".format(feature_selection[0]))

            # get the reduced dataframe
            df_reduced, requested_features = self.get_reduced_dataframe(incoming_df, features)

            # check to see if this reduced dataframe has introduced unobserved latent edges
            latent_edges = []
            latent_edges.extend(self.algo_runner.algo_miic(df_reduced))

            if verbose:
                print("There are {0} latent edges in the reduced dataset".format(len(latent_edges)))

            analysis_results = self._run_causal_algorithms(df_reduced,
                                                           feature_selection_method=feature_selection[0],
                                                           requested_features=requested_features,
                                                           n_features=n_features,
                                                           target_graph_str=target_graph_str,
                                                           algorithm_list=algorithm_list,
                                                           latent_edges=latent_edges,
                                                           pc=pc,
                                                           verbose=verbose)

            if verbose:
                print("Completed causal discovery on features selected by {0}".format(feature_selection[0]))

            amalgamated_analysis_results.append(analysis_results)

        if verbose:
            print("Completed analysis.")

        # we need to flatten all the results
        amalgamated_list_of_dicts = []
        final_results = []
        for results in amalgamated_analysis_results:
            for result in results.results:
                # append as dict for the dataframe output
                amalgamated_list_of_dicts.append(result.asdict())
                # flatten the results
                final_results.append(result)

        # generate the target graph for the user
        target_graph = self.graph_util.get_causal_graph_from_dot(target_graph_str)

        return final_results, pd.DataFrame(amalgamated_list_of_dicts), target_graph

    def run_causal_discovery(self, df, target_graph_str, algorithm_list, pc):
        """
        Runs the causal discovery.
        :param df: dataframe
        :param target_graph_str: string in dot format
        :param algorithm_list: list of causals discovery algorithms
        :param pc: py-causal object for java vm communication
        :return: tuple:
            (AnalysisResults obj,
            all results dataframe)
        """
        analysis_results = self._run_causal_algorithms(df,
                                                       target_graph_str=target_graph_str,
                                                       algorithm_list=algorithm_list,
                                                       pc=pc)
        return analysis_results, analysis_results.to_dataframe()

    def get_reduced_dataframe(self, incoming_df, feature_indices, sample_with_gmm=False):
        """
        A wrapper call for the BayesianGaussianMixtureWrapper :)
        """
        bgmm = BayesianGaussianMixtureWrapper()
        return bgmm.get_reduced_dataframe(incoming_df, feature_indices, sample_with_gmm)

    def _run_causal_algorithms(self,
                               incoming_df,
                               requested_features=None,
                               feature_selection_method=None,
                               n_features=None,
                               algorithm_list=None,
                               target_graph_str=None,
                               latent_edges=None,
                               pc=None,
                               verbose=True):
        """
        Internal. Runs an analysis on the supplied dataframe.
        This can take a PyCausalWrapper if multiple runs are being done.
        """
        if latent_edges is None:
            latent_edges = []

        analysis_results = AnalysisResults()
        pc_supplied = True

        # get py-causal if needed
        if pc is None:
            pc_supplied = False
            pc = pycausal()
            pc.start_vm()

        algo_list = self._get_causal_algorithms(algorithm_list)

        for algo in algo_list:
            # dict to store run result
            analysis_result = SingleAnalysisResult()
            analysis_result.feature_selection_method = feature_selection_method
            analysis_result.feature_list = requested_features
            analysis_result.num_features_requested = n_features
            analysis_result.causal_algorithm = algo[0]
            analysis_result.latent_edges = latent_edges

            if verbose:
                print("Running causal discovery using {0}".format(algo[0]))

            # get the graph from the algo
            algo_fn = algo[1]
            dot_str = self._discover_graph(algo_fn, incoming_df, pc)

            # store the dot graph
            analysis_result.dot_format_string = dot_str

            # convert the causal graph
            if dot_str is not None:
                causal_graph = self.graph_util.get_causal_graph_from_dot(dot_str)
                analysis_result.causal_graph = causal_graph
                nx_graph = self.graph_util.get_digraph_from_dot(dot_str)
                analysis_result.causal_graph_with_latent_edges = \
                    self.graph_util.get_causal_graph_with_latent_edges(nx_graph, latent_edges)

            analysis_results.results.append(analysis_result)

        # shutdown the java vm if needed
        if not pc_supplied:
            pc.stop_vm()

        # filter the results
        analysis_results_filtered = self._filter_empty_results(analysis_results)

        # add the causal metrics
        updated_analysis_results = self._add_causal_metrics(analysis_results_filtered, target_graph_str)

        return updated_analysis_results

    def _discover_graph(self, algo_fn, df, pc):
        """
        Discover the graph using the supplied algorithm function.
        """
        dot_str = algo_fn(df, pc)
        return dot_str

    def _filter_empty_results(self, incoming_results):
        filtered_results = AnalysisResults()
        for result in incoming_results.results:
            if result.causal_graph is not None:
                filtered_results.results.append(result)
        return filtered_results

    def _get_feature_selection_algorithms(self, feature_selection_list):
        """
        Gets the list of feature selection algorithms to run.
        """
        algo_list = feature_selection_list
        if algo_list is None:
            algo_list = self.feature_selection.get_all_feature_selection_algorithms()
        return algo_list

    def _get_causal_algorithms(self, algorithm_list):
        """
        Gets the list of causal algorithms to run.
        """
        algo_list = algorithm_list
        if algo_list is None:
            algo_list = self.algo_runner.get_all_causal_algorithms()
        return algo_list

    def _add_causal_metrics(self, incoming_analysis_results, target_graph_str):
        """
        Provides the causal analysis results.
        """
        return_analysis_results = AnalysisResults()
        target_nxgraph = None
        if target_graph_str is not None:
            target_nxgraph = self.graph_util.get_nxgraph_from_dot(target_graph_str)

        for result in incoming_analysis_results.results:
            if result.dot_format_string is not None \
                    and result.causal_graph is not None:
                pred_graph = self.graph_util.get_nxgraph_from_dot(result.dot_format_string)
                if target_nxgraph is not None:
                    prec_recall = self.graph_metrics.precision_recall(target_nxgraph, pred_graph)[0]
                    shd = self.graph_metrics.SHD(target_nxgraph, pred_graph)
                else:
                    prec_recall = 0
                    shd = 0
                result.AUPRC = prec_recall
                result.SHD = shd
            return_analysis_results.results.append(result)
        return return_analysis_results
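A typical end-to-end call into the App class above, mirroring the tests later in this listing; the py-causal import path and the data file path are assumptions taken from those tests:

import pandas as pd
from pycausal.pycausal import pycausal  # import path assumed from the py-causal package

# start the Java VM once and reuse it across runs
pc = pycausal()
pc.start_vm()

aitia = App()
df = pd.read_table("resources/data/charity.txt", sep="\t")  # sample dataset used in the tests

# run feature selection and causal discovery for a range of feature counts,
# from feature_high down towards feature_low, keeping the run with the lowest SHD
results, best_df, target_graph, all_runs_df = aitia.run_analysis_with_high_low(
    df,
    feature_high=5,
    feature_low=3,
    pc=pc)

pc.stop_vm()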
class Test_Feature_Reduction(TestAPI):
    """
    Tests for feature reduction.
    """

    runner = FeatureSelectionRunner()

    bgmm = BayesianGaussianMixtureWrapper()

    def setUp(self):
        pass

    def tearDown(self):
        pass

    def test_randomforest_feature_reduction(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_list = FeatureSelectionRunner.random_forest_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data, feature_list)
        self.assertTrue(df_reduced is not None)

    def test_pfa(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.pfa_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)

    def test_linear_regression(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.linear_regression_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)

    def test_sgdclassifier(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.sgdclassifier_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)

    def test_xgboost(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.xgboost_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)

    def test_rfe(self):
        hepart_data = TargetData.hepar2_100_data()
        self.assertTrue(hepart_data is not None, "No data loaded.")
        feature_indices = FeatureSelectionRunner.rfe_feature_reduction(
            hepart_data, 10)
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data,
                                                     feature_indices)
        self.assertTrue(df_reduced is not None)
class Test_App(TestAPI):
    """
    Tests for the aitia_explorer app.
    """
    bgmm = BayesianGaussianMixtureWrapper()

    def setUp(self):
        self.data_dir = os.path.join(os.path.dirname(__file__), 'resources/data')

    def tearDown(self):
        pass

    def test_scm_load(self):
        aitia = App()
        scm1 = aitia.data.scm1()
        target_graph_str = str(scm1.cgm.draw())
        df = scm1.sample(1000)
        self.assertTrue(target_graph_str is not None)
        self.assertTrue(df is not None)

    def test_run_causal_analysis(self):
        pc = pycausal()
        pc.start_vm()
        aitia = App()
        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")
        # just need a test graph
        dot_str = aitia.algo_runner.algo_pc(df, pc)
        # algo list
        algorithm_list = []
        algorithm_list.append(aitia.algo_runner.PC)
        algorithm_list.append(aitia.algo_runner.FCI)
        analysis_results = aitia._run_causal_algorithms(df,
                                                        algorithm_list=algorithm_list,
                                                        target_graph_str=dot_str,
                                                        pc=pc)
        self.assertTrue(analysis_results is not None)

    def test_run_full_analysis(self):
        # setup
        pc = pycausal()
        pc.start_vm()
        aitia = App()
        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")

        # just need a test graph
        dot_str = aitia.algo_runner.algo_pc(df, pc)

        # feature selection algos
        feature_selection_list = []
        feature_selection_list.append(aitia.feature_selection.LINEAR_REGRESSION)
        feature_selection_list.append(aitia.feature_selection.PRINCIPAL_FEATURE_ANALYSIS)

        # causal algo list
        algorithm_list = []
        algorithm_list.append(aitia.algo_runner.PC)
        algorithm_list.append(aitia.algo_runner.FCI)
        analysis_results, summary_df, _ = aitia.run_analysis(
            df,
            target_graph_str=dot_str,
            n_features=4,
            feature_selection_list=feature_selection_list,
            algorithm_list=algorithm_list,
            pc=pc)
        self.assertTrue(analysis_results is not None)
        self.assertTrue(summary_df is not None)

    def test_run_analysis_with_high_low(self):
        # setup
        pc = pycausal()
        pc.start_vm()
        aitia = App()
        data_dir = os.path.join(self.data_dir, "charity.txt")
        df = pd.read_table(data_dir, sep="\t")

        # just need a test graph
        dot_str = aitia.algo_runner.algo_pc(df, pc)

        # feature selection algos
        feature_selection_list = []
        feature_selection_list.append(aitia.feature_selection.LINEAR_REGRESSION)
        feature_selection_list.append(aitia.feature_selection.PRINCIPAL_FEATURE_ANALYSIS)

        # causal algo list
        algorithm_list = []
        algorithm_list.append(aitia.algo_runner.PC)
        algorithm_list.append(aitia.algo_runner.FCI)
        analysis_results, summary_df, _, _ = aitia.run_analysis_with_high_low(
            df,
            target_graph_str=dot_str,
            feature_high=5,
            feature_low=3,
            feature_selection_list=feature_selection_list,
            algorithm_list=algorithm_list,
            pc=pc)
        self.assertTrue(analysis_results is not None)
        self.assertTrue(summary_df is not None)

    def test_all_returned_features(self):
        feature_set = set()
        hepart_data = TargetData.hepar2_100_data()
        feature_set.update(FeatureSelectionRunner.random_forest_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.pfa_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.linear_regression_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.xgboost_feature_reduction(hepart_data, 10))
        feature_set.update(FeatureSelectionRunner.rfe_feature_reduction(hepart_data, 10))
        df_reduced = self.bgmm.get_reduced_dataframe(hepart_data, list(feature_set))
        self.assertTrue(df_reduced is not None)