Example #1
    # Assumes: import numpy as np; from scipy.stats import multivariate_normal as mvn, multinomial as mlvn
    def predict(self, X, DistStr):
        N, D = X.shape

        if DistStr == "Gauss":
            P_hat = np.zeros((N, len(self.K)))

            for k, l in self.likelihoods.items():
                P_hat[:, k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(
                    self.priors[k])

            return P_hat.argmax(axis=1)

        if DistStr == "Multinomial":
            P_hat = np.zeros((N, len(self.K)))

            for k, l in self.likelihoods.items():
                P_hat[:, k] = mlvn.logpmf(X, l["N"], l["P"]) + np.log(
                    self.priors[k])

            return P_hat.argmax(axis=1)

        if DistStr == "Bernoulli":
            P_hat = np.zeros((N, len(self.K)))

            for k, l in self.likelihoods.items():
                # Bernoulli log-likelihood: X @ log(mean) + (1 - X) @ log(1 - mean)
                # reduces the N x D feature matrix to one log-probability per sample
                # for class k; abs() guards against a negative argument if a mean slightly exceeds 1.
                P_hat[:, k] = np.log(self.priors[k]) + np.matmul(
                    X, np.log(l["mean"])) + np.matmul(
                        (1 - X), np.log(abs(1 - l["mean"])))

            return P_hat.argmax(axis=1)
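
# Usage sketch (not part of the original example): the predict method above
# assumes self.K holds the class labels and that fit() has stored per-class
# priors and likelihood parameters. The minimal Gaussian "fit" below is
# hypothetical and only exercises the same log-posterior computation as the
# "Gauss" branch.
import numpy as np
from scipy.stats import multivariate_normal as mvn

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (50, 2)), rng.normal(3, 1, (50, 2))])
y = np.array([0] * 50 + [1] * 50)

likelihoods, priors = {}, {}
for k in np.unique(y):
    Xk = X[y == k]
    likelihoods[k] = {"mean": Xk.mean(axis=0), "cov": np.cov(Xk, rowvar=False)}
    priors[k] = len(Xk) / len(X)

# Same log-posterior computation as the "Gauss" branch above.
P_hat = np.zeros((len(X), len(likelihoods)))
for k, l in likelihoods.items():
    P_hat[:, k] = mvn.logpdf(X, l["mean"], l["cov"]) + np.log(priors[k])
pred = P_hat.argmax(axis=1)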
Example #2
# Assumes: import numpy as np; from scipy.stats import multinomial;
# projsplx is an external helper that projects a vector onto the probability simplex.
def AssignClustersSingleTopic(M, omega, X):
    """
    Assign a corpus of documents X to the most likely topic, 
    via the MAP assignment of Remark 2.1
    @param M: the conditional expectations matrix
    @param omega: the mixing weights
    @param X: a bag-of-words documents distributed
        as a Single Topic Model, with N rows an n columns;
        at position (i,j) we have the number of times the word j appeared in doc. i,
    """
    # Ensure the columns of M lie in the probability simplex
    M = M / np.sign(M.sum(0))
    M[M <= 0] = 0.000001
    M[M >= 1] = 0.999999
    M = M / M.sum(0)
    # Ensure omega lies in the probability simplex
    omega = projsplx(omega)
    N, n = X.shape
    n, k = M.shape
    wmu = np.zeros((N, k))
    nn = X.sum(1)

    # Log-probability that each document was generated by each topic, weighted by the topic prior
    for i in range(k):
        mu = M[:, i].reshape(n)
        wmu[:, i] = multinomial.logpmf(X, n=nn, p=mu) + np.log(omega[i])

    #Perform MAP assignment
    CL = np.argmax(wmu, 1)
    return CL
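
# Illustration (not part of the original example): scipy's multinomial.logpmf
# accepts a vector of per-document totals n, so every row of X is scored
# against a topic's word distribution in one call. M, omega, and the documents
# below are synthetic and only demonstrate the MAP step.
import numpy as np
from scipy.stats import multinomial

rng = np.random.default_rng(0)
n_words, n_topics, n_docs = 5, 2, 4

M = rng.random((n_words, n_topics))
M = M / M.sum(0)                      # columns are per-topic word distributions
omega = np.array([0.6, 0.4])          # mixing weights

X = rng.multinomial(20, M[:, 0], size=n_docs)   # documents of 20 words each
nn = X.sum(1)

wmu = np.zeros((n_docs, n_topics))
for i in range(n_topics):
    wmu[:, i] = multinomial.logpmf(X, n=nn, p=M[:, i]) + np.log(omega[i])

CL = wmu.argmax(1)   # MAP topic per document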
def calc_independent_loglikelihood_var_disc(variable):
    # Multinomial log-likelihood of a single discrete variable in the
    # module-level train_df, using MLE category probabilities.
    x = train_df.groupby([variable]).size()
    n = len(train_df[variable])
    p = x / n
    loglike_array = multinomial.logpmf(x.tolist(), n, p.tolist())
    loglikelihood = np.sum(loglike_array)
    return loglikelihood
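
# Self-contained illustration (the original reads a module-level train_df that
# is not shown here; the DataFrame below is hypothetical).
import numpy as np
import pandas as pd
from scipy.stats import multinomial

train_df = pd.DataFrame({"Gender": ["M", "F", "F", "M", "F"]})

x = train_df.groupby(["Gender"]).size()   # counts per category: F=3, M=2
n = len(train_df["Gender"])               # total observations: 5
p = x / n                                 # MLE category probabilities
print(float(np.sum(multinomial.logpmf(x.tolist(), n, p.tolist()))))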
def run(ARGS, data=None, model=None, is_test=False):
    # Fit a classification model on the train split, evaluate test
    # log-likelihood and accuracy, and store the results.

    data = data or get_classification_data(ARGS.dataset, split=ARGS.split)
    model = model or get_classification_model(ARGS.model)(
        data.K, is_test=is_test, seed=ARGS.seed)

    def onehot(Y, K):
        return np.eye(K)[Y.flatten().astype(int)].reshape(Y.shape[:-1] + (K, ))

    Y_oh = onehot(data.Y_test, data.K)[None, :, :]  # 1, N_test, K

    model.fit(data.X_train, data.Y_train)
    p = model.predict(data.X_test)  # N_test, K

    # clip extreme probabilities, then renormalize so each row sums to 1
    eps = 1e-12
    p = np.clip(p, eps, 1 - eps)
    p = p / np.expand_dims(np.sum(p, -1), -1)

    # evaluation metrics
    res = {}

    logp = multinomial.logpmf(Y_oh, n=1, p=p)

    res['test_loglik'] = np.average(logp)

    pred = np.argmax(p, axis=-1)

    res['test_acc'] = np.average(
        np.array(pred == data.Y_test.flatten()).astype(float))

    res['Y_test'] = data.Y_test
    res['p_test'] = p

    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('classification', res)

    return res
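
# Note (illustration, not from the original): with n=1 and a one-hot outcome,
# the multinomial pmf reduces to the categorical likelihood, so the logpmf call
# above returns log p[true class] for every test point.
import numpy as np
from scipy.stats import multinomial

p = np.array([0.7, 0.2, 0.1])
y_onehot = np.array([1, 0, 0])
print(multinomial.logpmf(y_onehot, n=1, p=p), np.log(0.7))   # both equal log(0.7)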
def run(ARGS, is_test=False):
    # Active-learning loop: fit on the labelled points, then greedily add the
    # highest-entropy unlabelled point at each iteration.
    data = get_classification_data(ARGS.dataset, split=ARGS.split, prop=1.)

    ind = np.zeros(data.X_train.shape[0]).astype(bool)
    ind[:ARGS.num_initial_points] = True

    X, Y = data.X_train, data.Y_train

    def onehot(Y, K):
        return np.eye(K)[Y.flatten().astype(int)].reshape(Y.shape[:-1] + (K, ))

    Y_oh = onehot(Y, data.K)

    Model = get_classification_model(ARGS.model)
    model = Model(data.K, is_test=is_test, seed=ARGS.seed)

    test_ll = []
    train_ll = []
    all_ll = []
    test_acc = []
    train_acc = []
    all_acc = []

    for _ in range(min(ARGS.iterations, X.shape[0] - ARGS.num_initial_points)):
        model.fit(X[ind], Y[ind])

        p = model.predict(X)  # NK
        # clip extreme probabilities, then renormalize so each row sums to 1
        eps = 1e-12
        p = np.clip(p, eps, 1 - eps)
        p = p / np.expand_dims(np.sum(p, -1), -1)

        # entropy of predictions at all points
        ent = multinomial.entropy(n=1, p=p)

        # set the seen ones to -inf so we don't choose them
        ent[ind] = -np.inf

        # choose the highest entropy point to see next
        i = np.argmax(ent)
        ind[i] = True

        logp = multinomial.logpmf(Y_oh, n=1, p=p)  # N
        is_correct = (np.argmax(p, 1) == Y.flatten())  # N

        test_ll.append(np.average(logp[np.invert(ind)]))
        train_ll.append(np.average(logp[ind]))
        all_ll.append(np.average(logp))
        test_acc.append(np.average(is_correct[np.invert(ind)]))
        train_acc.append(np.average(is_correct[ind]))
        all_acc.append(np.average(is_correct))

    res = {
        'test_loglik': np.array(test_ll),
        'train_loglik': np.array(train_ll),
        'total_loglik': np.array(all_ll),
        'test_acc': np.array(test_acc),
        'train_acc': np.array(train_acc),
        'total_acc': np.array(all_acc),
    }
    res.update(ARGS.__dict__)

    if not is_test:  # pragma: no cover
        with Database(ARGS.database_path) as db:
            db.write('active_learning_discrete', res)

    return res
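
# Note (illustration, not from the original): for n=1 the multinomial is the
# categorical distribution, so multinomial.entropy(n=1, p=p) is the Shannon
# entropy of each predictive row, i.e. the acquisition score used above.
import numpy as np
from scipy.stats import multinomial

p = np.array([[0.90, 0.05, 0.05],
              [0.34, 0.33, 0.33]])
print(multinomial.entropy(n=1, p=p))   # the more uncertain second row scores higher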
def calc_cond_loglikelihood(variables):
    # Log-likelihood contribution of variables[0] given its parents
    # (variables[1:]) in the module-level train_df; discrete, continuous,
    # and mixed parent sets are handled separately.
    y = variables[0]
    parents = variables[1:]
    parents_d = []
    parents_c = []
    loglikelihood = 0

    # Create parent sets partitioned by continuous and discrete variables
    for parent in parents:
        if parent in [
                'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsiv',
                'SS'
        ]:
            parents_c.append(parent)
        else:
            parents_d.append(parent)

    # Check whether y is continuous or discrete
    y_continuous = y in [
        'Nscore', 'Escore', 'Oscore', 'Ascore', 'Cscore', 'Impulsiv', 'SS'
    ]

    # if all variables are discrete
    if len(parents_c) == 0 and not y_continuous:
        X = train_df.groupby(variables).size()
        N = len(train_df)
        P = X / N
        loglike_array = multinomial.logpmf(X.tolist(), N, P.tolist())
        loglikelihood = np.sum(loglike_array)

    # if all variables are continuous
    elif len(parents_d) == 0 and y_continuous:
        X = train_df[parents_c + [y]]
        n, k = X.shape
        # MLE parameter estimation for each continuous column (parents_c and y)
        #mu_vec = []
        sigma_vec = []
        for var in X.columns:
            mean, variance = norm.fit(X[var])  # MLE; norm.fit returns (mean, std dev)
            #mu_vec.append(mean)
            sigma_vec.append(variance)
        sigma_array = np.array(sigma_vec)
        # Calculate Likelihood with Formula
        loglike_c = -(n / 2) * (np.log(abs(sigma_array)) +
                                k * np.log(2 * math.pi) + 1)
        loglikelihood = loglike_c.sum()

    # else: mixed case
    else:
        # Partitioning
        if y_continuous:
            X = train_df.set_index(parents_d)[parents_c + [y]]
        else:
            X = train_df.set_index(parents_d + [y])[parents_c]
        pi_i = X.index.unique().tolist()
        # Iterate over partitions
        for p in pi_i:
            # Create design matrix for partition p
            X = X.sort_index()
            X_p = X.loc[p]
            n, k = X_p.shape
            # MLE parameter estimation for each continuous column in this partition
            sigma_vec = []
            for var in X_p.columns:
                mean, variance = norm.fit(X_p[var])  # MLE; norm.fit returns (mean, std dev)
                # if the fitted scale is 0 (too few cases in this partition),
                # fall back to estimating from the full train_df
                if variance == 0:
                    mean, variance = norm.fit(train_df[var])
                sigma_vec.append(variance)
            sigma_array = np.array(sigma_vec)
            # Calculate Likelihood with Formula from Andrews et al.
            loglike_c = -(n / 2) * (np.log(abs(sigma_array)) +
                                    k * np.log(2 * math.pi) + 1)
            logprob_d = n * np.log(n / len(train_df))
            loglikelihood += loglike_c + logprob_d

    # Calculate likelihood of parents
    loglike_parents = 0
    for parent in parents:
        if parent in parents_c:
            loglike_parents += calc_independent_loglikelihood_var_cont(parent)
        else:
            loglike_parents += calc_independent_loglikelihood_var_disc(parent)
    loglikelihood = loglikelihood.sum() - loglike_parents.sum()

    return loglikelihood
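
# Illustration (not from the original; the frame and column names below are
# hypothetical): the mixed case partitions train_df by the discrete parents via
# set_index, then pulls each partition with .loc before fitting the continuous
# columns.
import pandas as pd

df = pd.DataFrame({"Country": ["UK", "UK", "US", "US"],
                   "Nscore": [0.1, 0.3, -0.2, 0.4],
                   "SS": [1.2, 0.9, 0.5, 0.7]})
X = df.set_index(["Country"])[["Nscore", "SS"]]
for part in X.index.unique().tolist():
    X_p = X.sort_index().loc[part]
    print(part, X_p.shape)   # each country's rows of continuous columns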