Exemplo n.º 1
0
    def test_utils(self):
        """get_YX must split each 6-column row into leading outputs y and
        trailing ``feature_size`` feature columns x."""
        data = np.tile(np.arange(6), (10, 1))

        # (feature_size, expected first y row, expected first x row)
        cases = [
            (2, [0, 1, 2, 3], [4, 5]),
            (1, [0, 1, 2, 3, 4], [5]),
            (5, [0], [1, 2, 3, 4, 5]),
        ]

        for feature_size, expected_y, expected_x in cases:
            y, x = get_YX(data, feature_size)

            self.assertEqual(y.shape, (10, len(expected_y)))
            self.assertEqual(x.shape, (10, feature_size))

            self.assertTrue(np.all(y[0, :] == expected_y))
            self.assertTrue(np.all(x[0, :] == expected_x))
Exemplo n.º 2
0
    def test_naive_factorization(self):
        """naive_factorization must leave its input untouched and emit one
        CREATE_LEAF_NODE request per scope variable, each carrying a single
        output column concatenated with all conditional features."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)  # 125 rows x 8 columns

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4  # last 4 columns are conditional features

        scope = [1, 3, 4, 6]
        data2 = np.array(data)  # independent copy, used to detect in-place mutation
        result = naive_factorization(data=data2,
                                     parent=parent,
                                     pos=0,
                                     context=ctx,
                                     scope=list(scope))

        # the input data must not have been modified in place
        self.assertListEqual(data.tolist(), data2.tolist())

        # the created node is attached at parent.children[0]
        self.assertEqual(parent.children[0], result[0][1]['parent'])

        y, x = get_YX(data, 4)

        # one leaf-creation request per variable in scope
        self.assertEqual(len(result), len(scope))
        for i, s in enumerate(scope):
            r = result[i]
            self.assertEqual(len(r), 2)
            self.assertEqual(r[0], SplittingOperations.CREATE_LEAF_NODE)
            self.assertEqual(type(r[1]['parent']), Product)
            self.assertEqual(r[1]['pos'], i)
            self.assertListEqual(r[1]['scope'], [s])
            # each leaf slice is the i-th output column plus all conditionals
            self.assertListEqual(r[1]['data'].tolist(),
                                 concatenate_yx(y[:, i], x).tolist())
Exemplo n.º 3
0
    def getCIGroups(local_data, ds_context=None, scope=None, families=None):
        """Group the output variables of *local_data* into (conditionally)
        independent clusters.

        :param local_data: np array in concatenated [y | x] layout
        :param ds_context: context providing feature_size (the y/x split point)
        :param scope: a list of indices of the output variables
        :param families: obsolete, unused
        :return: np array of clustering

        This function takes tuples (output, conditional) as input and returns
        independent groups.  BE CAREFUL WITH SPARSE DATA!

        NOTE(review): ``testRcoT``, ``epsilon`` and ``alpha`` are taken from
        the enclosing scope (this is a closure); ``alpha`` is the cutoff on
        the independence-test p-values used to build connected components.
        """

        # data = preproc(local_data, ds_context, None, ohe)

        y, x = get_YX(local_data, ds_context.feature_size)

        # pairwise conditional-independence p-values between output variables;
        # epsilon keeps exact zeros out of the adjacency matrix
        pvals = testRcoT(y, x) + epsilon

        # drop edges whose p-value exceeds the cutoff (treated as independent)
        pvals[pvals > alpha] = 0

        # each connected component of the remaining dependency graph becomes
        # one cluster; cluster ids start at 1
        clusters = np.zeros(y.shape[1])
        for i, c in enumerate(connected_components(from_numpy_matrix(pvals))):
            clusters[list(c)] = i + 1

        return split_conditional_data_by_clusters(y,
                                                  x,
                                                  clusters,
                                                  scope,
                                                  rows=False)
Exemplo n.º 4
0
def ExactMPE(spn,
             data,
             ds_context,
             node_top_down_mpe=_node_top_down_mpe,
             node_bottom_up_mpe_log=_node_bottom_up_mpe_log):
    """Intended exact MPE inference over *spn*.

    NOTE(review): most of the MPE machinery is commented out below — the
    function currently only runs the bottom-up likelihood pass and returns
    its result.  ``result``, ``y``/``x`` and the ``node_top_down_mpe`` /
    ``node_bottom_up_mpe_log`` arguments are not used by the active code.
    """
    y, x = get_YX(data, ds_context.feature_size)

    result = np.array(y)  # copy of the outputs; currently unused (see note)
    nodes = get_nodes_by_type(spn)

    # per-node likelihood storage, filled by the bottom-up pass below
    lls_per_node = np.zeros((data.shape[0], len(nodes)))
    # one pass bottom up evaluating the likelihoods
    # for i in range(data.shape[0]):

    # result[i, :] = exact_mpe_row(spn, y[i, :], x[i, :])
    # log_likelihood(spn, data, dtype=data.dtype,lls_matrix=lls_per_node)
    a = likelihood(spn, data, dtype=data.dtype, lls_matrix=lls_per_node)
    # a = concatenate_yx(result,x)

    # instance_ids = np.arange(data.shape[0])
    #
    # # one pass top down to decide on the max branch until it reaches a leaf, then it fills the nan slot with the mode
    # eval_spn_top_down(spn,eval_functions=node_top_down_mpe,  parent_result=instance_ids, data=data, lls_per_node=lls_per_node)

    # return data

    return a
def naive_factorization(data=None,
                        node_id=0,
                        context=None,
                        scope=None,
                        **kwargs):
    """Factorize every output variable in *scope* into its own leaf under a
    Product node.

    Returns the new Product node together with one CREATE_LEAF_NODE request
    per variable; each request carries that variable's output column
    concatenated with all conditional features.
    """
    assert scope is not None, "No scope"

    product = Product()
    product.scope = scope
    product.id = node_id

    y, x = get_YX(data, context.feature_size)

    leaf_requests = []
    for pos, variable in enumerate(scope):
        # reserve a child slot; its index doubles as the request position
        product.children.append(None)
        slice_data = concatenate_yx(y[:, pos].reshape(-1, 1), x)
        leaf_requests.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": slice_data,
                "parent_id": product.id,
                "pos": pos,
                "scope": [variable],
            },
        ))

    return product, leaf_requests
Exemplo n.º 6
0
    def predict(self, data, y=None):
        """Predict outputs for *data* (concatenated [y | x] layout).

        Only the Gaussian case is implemented: element 0 of the linear
        model's ``forward`` result is returned.  For any other parametric
        type the method falls through and implicitly returns None
        (behavior unchanged from the original).

        Cleanup: the original bound an unused ``label`` array and shadowed
        the *y* parameter with the get_YX outputs; both removed.
        """
        if self.parametric_type == Gaussian:
            _, x = get_YX(data, feature_size=self.feature_size)
            return self.coefficients.forward(x, smudge=0)[0]
Exemplo n.º 7
0
    def split_conditional_rows_KMeans(local_data, ds_context, scope):
        """Cluster the rows of *local_data* with KMeans on the outputs only.

        NOTE(review): ``pre_proc``, ``ohe``, ``n_clusters`` and ``seed``
        come from the enclosing scope (this is a closure).
        """
        # clustering is driven by the output columns y only; the
        # conditionals x are ignored for the cluster assignment
        y, x = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, ohe)

        # NOTE(review): precompute_distances was deprecated in scikit-learn
        # 0.23 and removed in 1.0 — confirm the pinned sklearn version.
        clusters = KMeans(n_clusters=n_clusters,
                          random_state=seed,
                          precompute_distances=True).fit_predict(data)

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
Exemplo n.º 8
0
    def test_create_conditional(self):
        """create_conditional must split the rows into two child slices
        according to the labelling function, without mutating its input,
        and emit one GET_NEXT_OP request per child slice."""

        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data2 = np.array(data)  # independent copy, used to detect mutation

        # fixed, shuffled 25% / 75% binary row assignment
        K = int(data.shape[0] * 0.25)
        split_idx = np.array([0] * K + [1] * (data.shape[0] - K))
        np.random.shuffle(split_idx)

        y, x = get_YX(data, 4)

        def label_conditional(local_y, local_x):
            # the labelling function must receive exactly the y/x split
            self.assertListEqual(local_y.tolist(), y.tolist())
            self.assertListEqual(local_x.tolist(), x.tolist())
            return split_idx

        result = create_conditional(data=data2,
                                    parent=parent,
                                    pos=0,
                                    context=ctx,
                                    scope=list(scope),
                                    label_conditional=label_conditional)

        # the input data must not have been modified in place
        self.assertListEqual(data.tolist(), data2.tolist())

        # one follow-up operation per child (two clusters)
        self.assertEqual(len(result), 2)

        for i, r in enumerate(result):
            self.assertEqual(r[0], SplittingOperations.GET_NEXT_OP)
            self.assertIn('data', r[1])
            self.assertEqual(parent.children[0], r[1]['parent'])
            self.assertEqual(r[1]['pos'], i)
            self.assertListEqual(scope, r[1]['scope'])
            self.assertEqual(r[1]['data'].shape[1], data.shape[1])

        conditional_node = result[0][1]['parent']

        # the conditional node must route every row to the child whose
        # data slice that row was placed in
        child_idx = conditional_supervised_likelihood(
            conditional_node,
            [np.zeros((data.shape[0], 1)),
             np.ones((data.shape[0], 1))], data)

        self.assertListEqual(result[0][1]['data'].tolist(),
                             data[child_idx[:, 0] == 0, :].tolist())
        self.assertListEqual(result[1][1]['data'].tolist(),
                             data[child_idx[:, 0] == 1, :].tolist())
Exemplo n.º 9
0
    def predict_proba(self, data):
        """Return one probability per row of *data* (concatenated [y | x])."""
        y, features = get_YX(data, self.feature_size)

        params = self.predict_params(features, y)

        if self.parametric_type != Categorical:
            return self.proba_func(y, params)

        # Categorical: pick, for every row, the parameter column that
        # matches the row's observed class label.
        probs = np.zeros((data.shape[0], 1))
        for col, label in enumerate(self.classes_):
            match = y == label
            probs[match] = params[match[:, 0], col]

        return probs
def remove_non_informative_features(data=None,
                                    node_id=0,
                                    scope=None,
                                    context=0,
                                    uninformative_features_idx=None,
                                    **kwargs):
    """Peel zero-variance output variables off into their own leaves.

    For every output column flagged in *uninformative_features_idx* a
    CREATE_LEAF_NODE request is emitted (that single output column plus all
    conditional features).  The remaining informative columns are bundled
    into one GET_NEXT_OP request so structure learning continues on them.

    :param data: np array in concatenated [y | x] layout
    :param node_id: id assigned to the created Product node
    :param scope: output variable ids, aligned with y's columns
    :param context: context providing feature_size (the y/x split point)
    :param uninformative_features_idx: boolean flags, one per scope entry
    :return: (product_node, list of (operation, params) requests)
    """
    assert uninformative_features_idx is not None, "parameter uninformative_features_idx can't be None"

    prod_node = Product()
    prod_node.scope = scope
    prod_node.id = node_id

    y, x = get_YX(data, context.feature_size)

    informative_rvs = []
    informative_idx = []
    result = []
    for idx, is_uninformative in enumerate(uninformative_features_idx):
        rv = scope[idx]

        if not is_uninformative:
            informative_rvs.append(rv)
            informative_idx.append(idx)
            continue

        prod_node.children.append(None)
        data_slice = concatenate_yx(y[:, idx].reshape(-1, 1), x)
        result.append((
            SplittingOperations.CREATE_LEAF_NODE,
            {
                "data": data_slice,
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": [rv],
            },
        ))

    # this operation should only be requested when at least one column is flagged
    assert len(result) > 0

    if len(informative_idx) > 0:
        prod_node.children.append(None)
        result.append((
            SplittingOperations.GET_NEXT_OP,
            {
                # Consistency fix: slice the informative columns from y (as
                # the loop above does), not from the full [y | x] data —
                # equivalent because y occupies data's leading columns, but
                # explicit about intent.
                "data": concatenate_yx(y[:, informative_idx], x),
                "parent_id": prod_node.id,
                "pos": len(prod_node.children) - 1,
                "scope": informative_rvs,
            },
        ))

    return prod_node, result
def create_conditional_slice(local_data, feature_size, scope,
                             label_conditional):
    """Split *local_data* into two conditional row slices.

    *label_conditional* receives the (y, x) split of the data and must
    return one cluster label per row.  Returns (None, None) when the
    labelling is degenerate (a single cluster); otherwise returns a
    SupervisedOr gate node plus the two row slices, each shaped as
    (data, scope, weight) where weight is the fraction of rows in the slice.
    """
    cluster_labels = label_conditional(*get_YX(local_data, feature_size))

    unique_labels = np.unique(cluster_labels)
    if len(unique_labels) == 1:
        # degenerate labelling: nothing to condition on
        return None, None

    assert len(unique_labels) == 2

    features = get_X(local_data, feature_size)

    assert features.shape[0] == cluster_labels.shape[0]

    node = SupervisedOr(feature_size=feature_size)
    node.scope.extend(scope)

    # BUGFIX: the original returned `data_slices` without ever defining it
    # (the classifier-based construction was entirely commented out), so the
    # function always raised NameError.  Build the slices directly from the
    # conditional labels, mirroring the commented-out code's
    # (data, scope, weight) layout.  NOTE(review): the original draft also
    # fit a LogisticRegression gate on `features` — confirm whether the gate
    # node needs a trained classifier downstream.
    n_rows = len(cluster_labels)
    data_slices = []
    for label in unique_labels:
        idx = cluster_labels == label
        data_slices.append((local_data[idx, :], scope, np.sum(idx) / n_rows))

    return node, data_slices
def learn_cspn_structure(train_data,
                         ds_context,
                         op_lambdas=_conditional_op_lambdas,
                         split_rows=None,
                         split_cols=None,
                         create_leaf=None,
                         **kwargs):
    """Learn a CSPN by delegating to learn_structure over the full output scope."""
    # the scope covers every output column of the [y | x] layout
    outputs, _ = get_YX(train_data, ds_context.feature_size)
    full_scope = list(range(outputs.shape[1]))

    return learn_structure(train_data,
                           ds_context,
                           compress=False,
                           scope=full_scope,
                           op_lambdas=op_lambdas,
                           split_rows=split_rows,
                           split_cols=split_cols,
                           create_leaf=create_leaf,
                           **kwargs)
Exemplo n.º 13
0
def supervised_leaf_likelihood(node, data=None, dtype=np.float64):
    """Evaluate the leaf's predictor on every fully observed row of *data*.

    Rows whose output value is NaN are treated as marginalized and keep
    probability 1.  Probabilities numerically equal to 0 are floored so
    downstream log-likelihoods stay finite.
    """
    assert len(node.scope) == 1, node.scope

    y, x = get_YX(data, node.feature_size)
    y = y[:, node.scope]

    probs = np.ones((y.shape[0], 1), dtype=dtype)

    observed = ~np.isnan(y[:, 0])

    if observed.sum() > 0:
        observed_rows = concatenate_yx(y[observed], x[observed])
        probs[observed] = node.predictor.predict_proba(observed_rows)

    # floor (near-)zero probabilities
    probs[np.isclose(probs, 0)] = 0.000000001

    return probs
Exemplo n.º 14
0
    def test_remove_non_informative_features(self):
        """Constant (zero-variance) output columns must each become their own
        leaf request, while the remaining variables stay grouped together."""
        np.random.seed(17)
        data = np.arange(0, 1000).reshape(-1, 8)
        # make output columns 1 and 3 constant -> zero variance
        data[:, 1] = 1
        data[:, 3] = 3

        parent = Sum()
        parent.children.append(None)

        ctx = Context()
        ctx.feature_size = 4

        scope = [1, 3, 4, 6]
        data2 = np.array(data)  # independent copy, used to detect mutation

        y, x = get_YX(data, 4)

        # flags y columns 1 and 3 (scope variables 3 and 6) as uninformative
        uninformative_features_idx = np.var(y, 0) == 0
        result = remove_non_informative_features(
            data=data2,
            parent=parent,
            pos=0,
            context=ctx,
            scope=list(scope),
            uninformative_features_idx=uninformative_features_idx)

        # the input data must not have been modified in place
        self.assertListEqual(data.tolist(), data2.tolist())

        self.assertEqual(len(parent.children[0].children), len(result))

        # constant vars 3 and 6 get individual leaves; 1 and 4 stay grouped
        resulting_scopes = [[3], [6], [1, 4]]
        resulting_data_y = [y[:, 1], y[:, 3], y[:, [0, 2]]]

        for i, r in enumerate(result):
            self.assertEqual(len(r), 2)
            self.assertEqual(type(r[1]['parent']), Product)
            self.assertEqual(parent.children[0], r[1]['parent'])
            self.assertListEqual(r[1]['scope'], resulting_scopes[i])
            self.assertEqual(r[1]['pos'], i)

            self.assertListEqual(
                r[1]['data'].tolist(),
                concatenate_yx(resulting_data_y[i], x).tolist())
Exemplo n.º 15
0
def create_conditional_leaf(data, context, scope):
    """Fit a supervised leaf (linear predictor) for the single output
    variable in *scope* and return it."""
    assert len(scope) == 1, "scope of univariate parametric for more than one variable?"

    feature_size = context.feature_size
    parametric_type = context.parametric_types[scope[0]]

    predictor = CSPNLinearModel(parametric_type=parametric_type,
                                feature_size=feature_size)
    leaf = SupervisedLeaf(scope=scope,
                          predictor=predictor,
                          parametric_type=parametric_type,
                          feature_size=feature_size)

    # train the predictor on the conditional features / outputs split
    targets, features = get_YX(data, leaf.feature_size)
    leaf.predictor.fit(features, targets)

    return leaf
Exemplo n.º 16
0
    def split_rows_Gower(local_data, ds_context, scope):
        """Cluster rows via an R mixed-type (Gower-style) clustering routine.

        NOTE(review): ``pre_proc``, ``n_clusters``, ``seed`` and the R
        bridge ``robjects`` come from the enclosing scope (this is a
        closure); the R function "mixedclustering" is presumably sourced
        into the R session elsewhere — confirm.
        """
        # cluster on the preprocessed outputs only; conditionals x unused here
        y, x = get_YX(local_data, ds_context.feature_size)
        data = preproc(y, ds_context, pre_proc, False)

        # map each scope variable's meta type to the label expected by R
        feature_types = []
        for s in scope:
            mt = ds_context.meta_types[s]
            if mt == MetaType.BINARY:
                feature_types.append("categorical")
            elif mt == MetaType.DISCRETE:
                feature_types.append("discrete")
            else:
                feature_types.append("continuous")

        try:
            df = robjects.r["as.data.frame"](data)
            clusters = robjects.r["mixedclustering"](df, feature_types,
                                                     n_clusters, seed)
            clusters = np.asarray(clusters)
        except Exception as e:
            # dump the offending slice for post-mortem before re-raising
            np.savetxt("/tmp/errordata.txt", local_data)
            raise e

        return split_data_by_clusters(local_data, clusters, scope, rows=True)
def next_operation(
        data=None,
        parent_id=0,
        parent_type=None,
        pos=0,
        scope=None,
        no_clusters=False,
        no_splitting=False,
        no_independencies=True,
        # is_first=False,
        is_first=True,
        cluster_first=True,
        cluster_univariate=False,
        # cluster_univariate=True,
        min_features_slice=1,
        min_splitting_instances=500,
        min_clustering_instances=500,
        context=None,
        allow_sum_nodes=True,
        allow_conditioning_nodes=True,
        remove_uninformative_features=False,
        **kwargs):
    """Decide which structure-learning operation to apply to a data slice.

    Returns ``(None, [(operation, params)])`` where params echoes the slice
    (data / parent_id / pos / scope) plus the no_clusters and
    no_independencies flags.

    NOTE(review): ``parent_type``, ``no_splitting``, ``allow_sum_nodes`` and
    ``allow_conditioning_nodes`` are accepted but never read by the active
    code below, and ``isMinimalSplittingInstances`` is only referenced by
    the commented-out conditioning branch — confirm before relying on them.
    """
    y, x = get_YX(data, context.feature_size)

    # slice-size tests that gate the decisions below
    isMinimalFeatures = y.shape[1] <= min_features_slice
    isMinimalClusteringInstances = y.shape[0] <= min_clustering_instances
    isMinimalSplittingInstances = y.shape[0] <= min_splitting_instances

    result_params = {
        "data": data,
        "parent_id": parent_id,
        "pos": pos,
        "scope": scope,
        "no_clusters": no_clusters,
        "no_independencies": no_independencies,
    }

    # zero-variance output columns are peeled off first, when enabled
    uninformative_features = np.var(y, 0) == 0
    if remove_uninformative_features and np.any(uninformative_features):
        result_op = SplittingOperations.REMOVE_UNINFORMATIVE_FEATURES
        result_params["uninformative_features_idx"] = uninformative_features
        return None, [(result_op, result_params)]
    #
    # if not isMinimalSplittingInstances and not no_splitting:
    #     #split as much as you can
    #     result_op = SplittingOperations.CREATE_CONDITIONAL_NODE
    #     # result_op = SplittingOperations.CREATE_SUM_NODE
    #     return None, [(result_op, result_params)]

    # single output variable left: make a leaf, unless univariate clustering
    # is explicitly requested and there are still enough rows to cluster
    if isMinimalFeatures:
        if isMinimalClusteringInstances or no_clusters:
            return None, [(SplittingOperations.CREATE_LEAF_NODE, result_params)
                          ]
        else:
            if cluster_univariate:
                return None, [(SplittingOperations.CREATE_SUM_NODE,
                               result_params)]
            else:
                return None, [(SplittingOperations.CREATE_LEAF_NODE,
                               result_params)]

    # too few rows, or both split directions forbidden: factorize naively
    if isMinimalClusteringInstances or (no_clusters and no_independencies):
        return None, [(SplittingOperations.NAIVE_FACTORIZATION, result_params)]

    if no_independencies:
        return None, [(SplittingOperations.CREATE_SUM_NODE, result_params)]

    if no_clusters:
        return None, [(SplittingOperations.CREATE_PRODUCT_NODE, result_params)]

    # first decision on a fresh slice: the row/column order is configurable
    if is_first:
        if cluster_first:
            return None, [(SplittingOperations.CREATE_SUM_NODE, result_params)]
        else:
            return None, [(SplittingOperations.CREATE_PRODUCT_NODE,
                           result_params)]

    return None, [(SplittingOperations.CREATE_PRODUCT_NODE, result_params)]