Exemplo n.º 1
0
    def getCIGroups(local_data, ds_context=None, scope=None, alpha=0.001, families=None):
        """
        Group the output variables into clusters from pairwise test p-values.

        The first ``len(scope)`` columns of the preprocessed data form the
        output block and all remaining columns form the conditioning block.
        ``testRcoT`` is run on the two blocks, every entry of its result that
        is greater than ``alpha`` is set to zero, and each connected component
        of the graph built from that matrix becomes one group.

        :param local_data: np array
        :param ds_context: forwarded to ``preproc``
        :param scope: a list of index to output variables; its length decides
            how many leading columns are outputs, and it is forwarded to the
            split helper. NOTE(review): the default ``None`` cannot work here
            because ``len(scope)`` is evaluated.
        :param alpha: threshold; cutoff parameter for connected components
        :param families: obsolete (not read)
        :return: the result of ``split_conditional_data_by_clusters`` for the
            computed group labels (called with ``rows=False``)

        BE CAREFUL WITH SPARSE DATA!
        """
        data = preproc(local_data, ds_context, None, ohe)
        n_rows = data.shape[0]

        # todo check scope and node.scope again
        is_output = np.zeros(data.shape, dtype=bool)
        is_output[:, np.arange(len(scope))] = True
        is_conditional = ~is_output

        # Boolean indexing flattens, so restore one row per instance.
        dataOut = data[is_output].reshape(n_rows, -1)
        dataIn = data[is_conditional].reshape(n_rows, -1)

        # NOTE(review): len() of a 2-D array counts rows, so these only check
        # that at least one instance exists, not that each block has columns.
        assert len(dataIn) > 0
        assert len(dataOut) > 0

        pvals = testRcoT(dataOut, dataIn)
        pvals[pvals > alpha] = 0

        # One label per output column; group labels start at 1.
        clusters = np.zeros(dataOut.shape[1])
        components = connected_components(from_numpy_matrix(pvals))
        for label, members in enumerate(components, start=1):
            clusters[list(members)] = label

        return split_conditional_data_by_clusters(local_data, clusters, scope, rows=False)
Exemplo n.º 2
0
    def split_rows_KMeans(local_data, ds_context, scope):
        """Split the rows of ``local_data`` by k-means cluster label.

        ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are not parameters;
        they are resolved from the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        features = preproc(local_data, ds_context, pre_proc, ohe)

        model = KMeans(n_clusters=n_clusters, random_state=seed)
        labels = model.fit_predict(features)

        # The labels come from the preprocessed copy, but the split is applied
        # to the untouched input data.
        return split_data_by_clusters(local_data, labels, scope, rows=True)
Exemplo n.º 3
0
    def split_rows_KMeans(local_data, ds_context, scope):
        """Split rows by k-means label and also hand back the fitted model.

        ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are resolved from
        the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: tuple ``(split, km_model)`` where ``split`` is the result of
            ``split_data_by_clusters`` (``rows=True``) and ``km_model`` is the
            fitted ``KMeans`` instance
        """
        features = preproc(local_data, ds_context, pre_proc, ohe)

        from sklearn.cluster import KMeans
        km_model = KMeans(n_clusters=n_clusters, random_state=seed)
        labels = km_model.fit_predict(features)

        split = split_data_by_clusters(local_data, labels, scope, rows=True)
        return split, km_model
Exemplo n.º 4
0
    def split_conditional_rows_KMeans(local_data, ds_context, scope):
        """Split rows by k-means labels computed on the first part of ``get_YX``.

        ``local_data`` is divided with ``get_YX(local_data,
        ds_context.feature_size)``; only the first returned part is
        preprocessed and clustered. The resulting labels are then used to
        split the full ``local_data`` by rows.

        ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed`` are resolved from
        the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: provides ``feature_size`` and is forwarded to
            ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        # The second part returned by get_YX is not needed for clustering.
        y, _ = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, ohe)

        # ``precompute_distances`` is deliberately not passed: scikit-learn
        # deprecated the argument (it no longer affected the result) and
        # removed it in 1.0, where passing it raises TypeError.
        # Assumes KMeans here is scikit-learn's -- TODO confirm.
        clusters = KMeans(n_clusters=n_clusters,
                          random_state=seed).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Exemplo n.º 5
0
    def split_rows_KMeans(local_data, ds_context, scope):
        """Split rows by k-means labels computed on a 3-component TSNE output.

        The preprocessed data is first passed through
        ``TSNE(n_components=3, ...)``; k-means is then run on that result.
        ``pre_proc``, ``ohe``, ``verbose``, ``ncpus``, ``n_clusters`` and
        ``seed`` are resolved from the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        features = preproc(local_data, ds_context, pre_proc, ohe)

        embedder = TSNE(n_components=3,
                        verbose=verbose,
                        n_jobs=ncpus,
                        random_state=seed)
        embedded = embedder.fit_transform(features)

        # The same seed is used for the embedding and for the clustering.
        clusterer = KMeans(n_clusters=n_clusters, random_state=seed)
        labels = clusterer.fit_predict(embedded)

        return split_data_by_clusters(local_data, labels, scope, rows=True)
Exemplo n.º 6
0
    def split_rows_Gower(local_data, ds_context, scope):
        """Split rows using labels from the R function ``mixedclustering``.

        The preprocessed data (one-hot flag fixed to ``False``) is converted
        with R's ``as.data.frame`` and passed to ``mixedclustering`` together
        with ``ds_context.distribution_family``, ``n_clusters`` and ``seed``
        (the latter two, like ``pre_proc``, come from the surrounding scope).

        If anything in the R round-trip raises, ``local_data`` is written to
        ``/tmp/errordata.txt``, the exception is printed, and it is re-raised.

        :param local_data: data whose rows are partitioned
        :param ds_context: provides ``distribution_family`` and is forwarded
            to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        prepared = preproc(local_data, ds_context, pre_proc, False)

        try:
            to_data_frame = robjects.r["as.data.frame"]
            frame = to_data_frame(prepared)
            mixed_clustering = robjects.r["mixedclustering"]
            r_labels = mixed_clustering(frame, ds_context.distribution_family, n_clusters, seed)
            clusters = np.asarray(r_labels)
        except Exception as e:
            # Keep the offending input around for post-mortem inspection.
            np.savetxt("/tmp/errordata.txt", local_data)
            print(e)
            raise e

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Exemplo n.º 7
0
    def split_rows_GMM(local_data, ds_context, scope):
        """Split rows by the component a fitted ``GaussianMixture`` predicts.

        The mixture is fitted on the preprocessed data and then asked to
        predict labels for that same data. ``pre_proc``, ``ohe``,
        ``n_clusters``, ``covariance_type``, ``max_iter``, ``n_init`` and
        ``seed`` are resolved from the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        features = preproc(local_data, ds_context, pre_proc, ohe)

        mixture_options = dict(
            n_components=n_clusters,
            covariance_type=covariance_type,
            max_iter=max_iter,
            n_init=n_init,
            random_state=seed,
        )
        estimator = GaussianMixture(**mixture_options)

        fitted = estimator.fit(features)
        labels = fitted.predict(features)

        return split_data_by_clusters(local_data, labels, scope, rows=True)
    def split_rows_KMeans(local_data, ds_context, scope):
        """Split rows by k-means label, fitting on a subsample for large inputs.

        When the preprocessed data has more than
        ``max_sampling_threshold_rows`` rows, k-means is fitted on that many
        randomly drawn rows and labels are then predicted for every row.
        Otherwise k-means is fitted on all rows directly.

        ``pre_proc``, ``ohe``, ``n_clusters``, ``seed`` and
        ``max_sampling_threshold_rows`` are resolved from the surrounding
        scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: tuple ``(result, centers)`` where ``result`` is the output of
            ``split_data_by_clusters`` (``rows=True``) and ``centers`` is
            ``kmeans.cluster_centers_`` converted with ``tolist()``
        """
        data = preproc(local_data, ds_context, pre_proc, ohe)

        kmeans = KMeans(n_clusters=n_clusters, random_state=seed)
        n_rows = data.shape[0]

        if n_rows <= max_sampling_threshold_rows:
            clusters = kmeans.fit_predict(data)
        else:
            # NOTE(review): the row indices are drawn with np.random.randint,
            # i.e. with replacement and from the global NumPy RNG rather than
            # from `seed` -- confirm this is intended.
            sampled_rows = np.random.randint(n_rows, size=max_sampling_threshold_rows)
            data_sample = data[sampled_rows, :]
            clusters = kmeans.fit(data_sample).predict(data)

        centers = kmeans.cluster_centers_.tolist()
        result = split_data_by_clusters(local_data, clusters, scope, rows=True)

        return result, centers
Exemplo n.º 9
0
    def split_rows_RuleClustering(
        local_data,
        ds_context,
        scope,
    ):
        """Split rows with a rule derived from a decision tree fit to k-means labels.

        Steps: run k-means on the preprocessed data, weight every sample by
        ``(N - count) / N`` of its k-means label, fit a decision tree to the
        k-means labels with those weights, turn the tree into a rule with
        ``tree_to_rule``, and split by the outcome of applying the negated
        rule to the preprocessed data.

        ``pre_proc``, ``ohe``, ``k``, ``rand_state`` and ``model`` are
        resolved from the surrounding scope.

        :param local_data: input data (only used as input to ``preproc``)
        :param ds_context: forwarded to ``preproc`` and ``tree_to_rule``
        :param scope: forwarded to ``tree_to_rule``, the rule's ``apply`` and
            ``split_data_by_clusters``
        :return: tuple ``(split, (left_rule, right_rule))``
        :raises ValueError: if ``model`` is ``'m-estimate'`` (not implemented)
            or any value other than ``'stump'`` / ``'tree'``
        """
        data = preproc(local_data, ds_context, pre_proc, ohe)

        # https://stackoverflow.com/a/39772170/5595684
        km = KMeans(k, random_state=rand_state)
        # The returned labels are not needed; km.labels_ is read below.
        km.fit_predict(data)

        # inverse weight classes, todo test if this works ok
        labels, counts = np.unique(km.labels_, return_counts=True)
        N = len(data)
        weight_of = {label: (N - n) / N for label, n in zip(labels, counts)}
        W = [weight_of[label] for label in km.labels_]

        # Pick the tree hyper-parameters for the requested model type.
        if model == 'stump':
            tree_options = dict(random_state=rand_state, max_depth=1)
        elif model == 'tree':
            # NOTE(review): `min_impurity_split` is not accepted by recent
            # scikit-learn releases -- confirm the targeted version.
            tree_options = dict(
                random_state=rand_state,
                max_depth=None,
                ccp_alpha=0.05,
                min_impurity_split=0.01,
                # max_leaf_nodes=2*10**(self.k+1)
            )
        elif model == 'm-estimate':
            raise ValueError('Not implemented')
        else:
            raise ValueError(str(model) + ' unknown model type')

        dtc = DecisionTreeClassifier(**tree_options).fit(
            data, km.labels_, sample_weight=W)
        # dtc.cost_complexity_pruning_path()
        left_rule = tree_to_rule(dtc, scope, ds_context)

        # todo try out rule clusters
        right_rule = left_rule.negate()
        applied = right_rule.apply(data, scope_partial_data=scope)
        rule_clusters = applied.astype(int)

        # NOTE(review): the preprocessed `data`, not `local_data`, is what gets
        # split here -- confirm this is intended.
        split = split_data_by_clusters(data, rule_clusters, scope, rows=True)
        assert len(split) == 2
        return split, (left_rule, right_rule)
Exemplo n.º 10
0
    def split_rows_Gower(local_data, ds_context, scope):
        """Split rows using R ``mixedclustering`` on the first part of ``get_YX``.

        ``local_data`` is divided with ``get_YX(local_data,
        ds_context.feature_size)``; only the first returned part is
        preprocessed (one-hot flag fixed to ``False``) and sent to R. Each
        index in ``scope`` contributes one feature-type string derived from
        ``ds_context.meta_types``: ``"categorical"`` for ``MetaType.BINARY``,
        ``"discrete"`` for ``MetaType.DISCRETE``, otherwise ``"continuous"``.

        If the R round-trip raises, ``local_data`` is written to
        ``/tmp/errordata.txt`` and the exception is re-raised.
        ``pre_proc``, ``n_clusters`` and ``seed`` are resolved from the
        surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: provides ``feature_size`` and ``meta_types`` and is
            forwarded to ``preproc``
        :param scope: indices into ``ds_context.meta_types``; also forwarded
            to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        # The second part returned by get_YX is not used in this function.
        y, x = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, False)

        feature_types = [
            "categorical" if mt == MetaType.BINARY
            else "discrete" if mt == MetaType.DISCRETE
            else "continuous"
            for mt in (ds_context.meta_types[s] for s in scope)
        ]

        try:
            to_data_frame = robjects.r["as.data.frame"]
            frame = to_data_frame(data)
            mixed_clustering = robjects.r["mixedclustering"]
            r_labels = mixed_clustering(frame, feature_types, n_clusters, seed)
            clusters = np.asarray(r_labels)
        except Exception as e:
            # Keep the offending input around for post-mortem inspection.
            np.savetxt("/tmp/errordata.txt", local_data)
            raise e

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Exemplo n.º 11
0
    def split_rows_DBScan(local_data, ds_context, scope):
        """Split the rows of ``local_data`` by DBSCAN cluster label.

        ``pre_proc``, ``ohe``, ``eps`` and ``min_samples`` are resolved from
        the surrounding scope.

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        features = preproc(local_data, ds_context, pre_proc, ohe)

        scanner = DBSCAN(eps=eps, min_samples=min_samples)
        labels = scanner.fit_predict(features)

        return split_data_by_clusters(local_data, labels, scope, rows=True)
Exemplo n.º 12
0
    def split_cols_random_partitions(local_data, ds_context, scope):
        """Split the columns of ``local_data`` using ``make_planes`` / ``above``.

        The data is transposed before preprocessing. The labels are column 0
        of ``above(make_planes(1, <preprocessed column count>, rand_gen),
        <preprocessed data>)``. ``ohe`` and ``rand_gen`` are resolved from the
        surrounding scope; no ``pre_proc`` step is applied (``None`` is
        passed).

        :param local_data: data whose columns are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=False``
        """
        # Work on the transpose so that every original column becomes a row.
        transposed = preproc(local_data.T, ds_context, None, ohe)

        planes = make_planes(1, transposed.shape[1], rand_gen)
        sides = above(planes, transposed)
        labels = sides[:, 0]

        return split_data_by_clusters(local_data, labels, scope, rows=False)
Exemplo n.º 13
0
    def split_rows_random_partitions(local_data, ds_context, scope):
        """Split the rows of ``local_data`` using ``make_planes`` / ``above``.

        The labels are column 0 of ``above(make_planes(1, n_features,
        rand_gen), data)``, where ``data`` is the preprocessed input and
        ``n_features`` is its column count. ``ohe`` and ``rand_gen`` are
        resolved from the surrounding scope; no ``pre_proc`` step is applied
        (``None`` is passed).

        :param local_data: data whose rows are partitioned
        :param ds_context: forwarded to ``preproc``
        :param scope: forwarded to ``split_data_by_clusters``
        :return: the result of ``split_data_by_clusters`` with ``rows=True``
        """
        data = preproc(local_data, ds_context, None, ohe)

        # Size the plane from the preprocessed data it is applied to, not from
        # the raw input: the two column counts differ whenever preprocessing
        # adds or removes columns (presumably the case with one-hot encoding).
        # This matches what the column-wise variant does.
        n_features = data.shape[1]
        clusters = above(make_planes(1, n_features, rand_gen), data)[:, 0]

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Exemplo n.º 14
0
    def split_rows_RuleClustering(
        local_data,
        ds_context,
        scope,
    ):
        """Split rows by k-means label and return a rule describing the split.

        Steps: run k-means on the preprocessed data, weight every sample by
        ``(N - count) / N`` of its k-means label, fit a decision tree to the
        k-means labels with those weights and turn it into a rule with
        ``tree_to_rule``. The split itself uses the k-means labels; the rule
        and its negation are returned alongside it.

        When ``debug`` is truthy, the tree is printed as text and, for
        two-column data, a scatter plot with the rule thresholds is shown.

        ``pre_proc``, ``ohe``, ``k``, ``rand_state``, ``model`` and ``debug``
        are resolved from the surrounding scope.

        :param local_data: input data (only used as input to ``preproc``)
        :param ds_context: forwarded to ``preproc`` and ``tree_to_rule``
        :param scope: forwarded to ``tree_to_rule`` and
            ``split_data_by_clusters``
        :return: tuple ``(split, (left_rule, right_rule))``
        :raises ValueError: if ``model`` is ``'m-estimate'`` (not implemented)
            or any value other than ``'stump'`` / ``'tree'``
        """
        data = preproc(local_data, ds_context, pre_proc, ohe)

        # https://stackoverflow.com/a/39772170/5595684
        km = KMeans(k, random_state=rand_state)
        km_clusters = km.fit_predict(data)
        labels, counts = np.unique(km.labels_, return_counts=True)
        # inverse weight classes, todo test if this works ok
        N = len(data)
        lab_wgt = {label: (N - n) / N for label, n in zip(labels, counts)}
        W = [lab_wgt[label] for label in km.labels_]

        if model == 'stump':
            dtc = DecisionTreeClassifier(
                random_state=rand_state,
                max_depth=1,
            ).fit(data, km.labels_, sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'tree':
            # NOTE(review): `min_impurity_split` is not accepted by recent
            # scikit-learn releases -- confirm the targeted version.
            dtc = DecisionTreeClassifier(
                random_state=rand_state,
                max_depth=None,
                ccp_alpha=0.05,
                min_impurity_split=0.01  # max_leaf_nodes=2*10**(self.k+1)
            ).fit(data, km.labels_, sample_weight=W)
            # dtc.cost_complexity_pruning_path()
            left_rule = tree_to_rule(dtc, scope, ds_context)
        elif model == 'm-estimate':
            raise ValueError('Not implemented')
        else:
            raise ValueError(str(model) + ' unknown model type')

        if debug:
            # subplots()/show() live in matplotlib.pyplot, not in the top-level
            # matplotlib package.
            import matplotlib.pyplot as plt  # todo remove when everythings working
            dt_labels = dtc.predict(data)
            # plot_tree(dtc)
            # plt.show()
            # export_text takes the fitted estimator itself.
            print(export_text(dtc))

            if data.shape[1] == 2:
                fig, ax = plt.subplots()
                # blue / green by predicted label, black where the tree
                # disagrees with k-means
                colors = np.full(dt_labels.shape, 'blue', dtype=object)
                np.putmask(colors, dt_labels.astype(bool), 'green')
                colors[km.labels_ != dt_labels] = 'black'
                # plot rule:
                assert len(left_rule) <= 2
                for cond in left_rule:
                    # NOTE(review): the 'threshhold' spelling is kept as-is;
                    # presumably it matches the key used by the rule
                    # conditions -- verify against tree_to_rule.
                    if cond['feature'] == 0:
                        ax.axvline(cond['threshhold'])
                    else:
                        ax.axhline(cond['threshhold'])
                ax.scatter(data[:, 0], data[:, 1], c=colors)
                plt.show()

        # todo try out rule clusters
        # rule_clusters = rule.apply(data)
        # NOTE(review): the preprocessed `data`, not `local_data`, is what gets
        # split here -- confirm this is intended.
        split = split_data_by_clusters(data, km_clusters, scope, rows=True)
        assert len(split) == 2
        right_rule = left_rule.negate()
        return split, (left_rule, right_rule)