Пример #1
0
def get_prediction(Y_test, theta, deltas):
    """Score ``theta`` on held-out data via missing-data moment estimates.

    The columns of ``Y_test`` are split into a lagged part (all but the last
    column) and a lead part (all but the first column). The function returns

        trace(V_lead - 2 * theta @ C + theta @ V_lagged @ theta.T)

    where ``V_lagged`` / ``V_lead`` come from ``missing.missing_var`` (called
    with ``make_positive=False``) and ``C`` from ``missing.missing_covar``.

    NOTE(review): this looks like the expanded one-step squared prediction
    error of ``lead ~ theta @ lagged`` -- confirm against the conventions of
    ``missing.missing_covar``.

    :param Y_test: 2-D array; columns are sliced as consecutive observations.
    :param theta: coefficient matrix applied to the lagged part.
    :param deltas: passed through unchanged to the ``missing`` helpers.
    :return: the trace of the combined matrix.
    """
    lagged, lead = Y_test[:, :-1], Y_test[:, 1:]

    var_lagged = missing.missing_var(lagged, deltas, make_positive=False)
    var_lead = missing.missing_var(lead, deltas, make_positive=False)
    cross = missing.missing_covar(lagged, lead, deltas, deltas)

    linear_term = 2 * np.matmul(theta, cross)
    quadratic_term = np.matmul(theta, np.matmul(var_lagged, theta.T))

    return (var_lead - linear_term + quadratic_term).trace()
Пример #2
0
def get_sonic_theta(Y_train, deltas, num_clusters, lmbd):
    """Fit the 'ALTER' matrix competition on training data and return theta.

    The moment matrices are built from the lagged columns of ``Y_train``
    (all but the last) and the lead columns (all but the first), then handed
    to ``matrix_competition`` together with ``num_clusters`` and ``lmbd``.

    NOTE(review): the positional ``5`` and ``epochs=50`` are fixed here; the
    meaning of the ``5`` is not visible from this file -- confirm against
    ``matrix_competition``.

    :return: the ``theta`` attribute of the fitted result.
    """
    lagged = Y_train[:, :-1]
    lead = Y_train[:, 1:]

    D0 = missing.missing_var(lagged, deltas)
    D1 = missing.missing_covar(lagged, lead, deltas, deltas)

    fit = matrix_competition('ALTER', 5, num_clusters, D0, D1, lmbd, epochs=50)
    return fit.theta
Пример #3
0
from common.alternating import matrix_competition

if __name__ == '__main__':
    # Each task pairs an input CSV path with a name for the saved result.
    # NOTE(review): save_path is unpacked below but never used in the visible
    # part of this script -- presumably the saving code follows; confirm.
    tasks = [
        ('../data/users_daily_timeseries_AAPL.csv', "theta_daily_AAPL")
        , ('../data/users_BTC_timeseries_Daily.csv', "theta_daily_BTC")
    ]

    for task in tasks:
        data_path, save_path = task

        # Y is a 2-D array with N rows and tmax columns (see np.shape below).
        # NOTE(review): presumably rows are users and columns are days, and
        # deltas are per-row observation rates -- confirm in read_data.
        Y, deltas, names = read_data.read_stock_twits_user_sentiment(data_path, min_days=50, min_delta=0.5)
        N, tmax = np.shape(Y)

        # Moment matrices from the lagged columns (all but the last) and the
        # lead columns (all but the first).
        D0 = missing.missing_var(Y[:, :-1], deltas)
        D1 = missing.missing_covar(Y[:, :-1], Y[:, 1:], deltas, deltas)

        # SET NUMBER OF CLUSTERS
        num_clusters = 2

        # eigvalsh returns eigenvalues in ascending order, so the index
        # -(num_clusters + 1) picks the (num_clusters + 1)-th largest one; it
        # is scaled by sqrt(log N) / sqrt(tmax * min(deltas)^2).
        lmbd = (np.linalg.eigvalsh(D0)[-(num_clusters + 1)]) * np.sqrt(np.log(N)) / np.sqrt(tmax * np.min(deltas) ** 2)
        print("lambda={}".format(lmbd))

        res = matrix_competition('ALTER', 5, num_clusters, D0, D1, lmbd, epochs=50)
        theta_est, v_est, u_est, ind_est, loss = res.theta, res.v, res.u, res.index, res.loss

        # Group the row indices 0..N-1 by their estimated cluster label.
        lists = [[] for i in range(num_clusters)]
        for i in range(N):
            lists[res.index[i]].append(i)
Пример #4
0
    for task in tasks:
        data_path = task

        Y, deltas, names = read_data.read_stock_twits_user_sentiment(
            data_path, min_days=50, min_delta=0.5)
        N, tmax = np.shape(Y)

        t_train = ceil(tmax * 0.7)
        Y_train = Y[:, :t_train]
        Y_test = Y[:, t_train - 1:]

        # SET NUMBER OF CLUSTERS
        num_clusters = 2

        # theoretical lambda
        D0 = missing.missing_var(Y_train, deltas)
        lmbd = (np.linalg.eigvalsh(D0)[-(num_clusters + 1)]) * np.sqrt(
            np.log(N)) / np.sqrt(tmax * np.min(deltas)**2)

        ##############################
        # we compare four methods:
        # 1) SONIC
        # 2) VAR + lasso = SONIC with K = N
        # 3) VAR = SONIC with K = N and lambda = 0
        # 4) theta = 0 (no causality)
        #

        methods = ["SONIC", "VAR+LASSO", "VAR", "ZERO"]
        thetas = {
            "SONIC": get_sonic_theta(Y_train, deltas, num_clusters, lmbd),
            "VAR+LASSO": get_sonic_theta(Y_train, deltas, N, lmbd),
Пример #5
0
def simu(n, c_num, s, T, pmin=1.0, show_heatmaps=False):
    """Simulate a clustered VAR(1) process and fit it over a grid of penalties.

    A ground-truth coefficient matrix ``theta_star = z_star @ v_star`` is
    built from a balanced cluster assignment, rescaled so that its spectral
    norm is 0.5, and used to generate a time series. Entries are then masked
    independently with keep-probability ``pmin`` and the 'ALTER' matrix
    competition is fitted once per penalty value.

    :param n: number of series (rows of the process).
    :param c_num: number of clusters in the ground truth and in every fit.
    :param s: number of active coefficients per cluster row (at most 5).
    :param T: number of time steps to generate.
    :param pmin: probability that an entry is observed (Bernoulli mask).
    :param show_heatmaps: when true, plot heatmaps of the last estimate and
        of ``theta_star``. Replaces the former hard-coded ``if False:``
        switch; relies on ``sns`` and ``plt`` being available in the module.
    :return: tuple ``(alphas, theta_diffs, cl_diffs, node_infls)`` where
        ``alphas`` is the penalty grid and the other three are arrays with
        one entry (or row) per penalty: Frobenius distance of the estimate
        to ``theta_star``, ``index_dist`` between estimated and true
        clustering, and column-wise 2-norms of the estimate.
    """
    # define true index: contiguous clusters, the first r of them get one
    # extra element so that all n nodes are covered
    c_size = int(n // c_num)
    r = n - c_num * c_size
    big_total = r * (c_size + 1)
    ind_star = np.array([
        i // (c_size + 1) if i < big_total else (i - big_total) // c_size + r
        for i in range(n)
    ])
    z_star = get_index_matrix(c_num, ind_star)

    # define true theta
    active_vals = [0.6, -0.4, 0.1, -0.8, 0.2]
    assert s <= len(active_vals), "s must not exceed the number of active values"
    v_star = np.zeros((c_num, n))
    for j in range(c_num):
        # row j carries s active values starting at column j, with
        # alternating sign across rows
        v_star[j, j:j + s] = np.array(active_vals[:s]) * ((-1)**j)

    theta_star = np.matmul(z_star, v_star)
    # rescale so that the spectral norm of theta_star equals 0.5
    coef = 0.5 / np.linalg.norm(theta_star, 2)

    v_star = v_star * coef
    theta_star = theta_star * coef

    # generate the time series
    # NOTE(review): the 1. is presumably the noise scale -- confirm.
    x = gauss_var1_process(theta_star, 1., T)

    # include missing observations: unobserved entries are zeroed out
    mask = np.random.binomial(1, pmin, size=(n, T)).astype(np.float64)
    x_missing = x * mask

    x_train = x_missing[:, :-1]
    y_train = x_missing[:, 1:]

    # empirical observation rate of every row
    deltas = np.mean(mask, axis=1)

    D0 = missing.missing_var(x_train, deltas)
    D1 = missing.missing_covar(x_train, y_train, deltas, deltas)

    # ten penalties, log-spaced between 2**-3 and 2**0, times
    # 3 * sqrt(log(n) / (T * pmin**2))
    alphas = np.logspace(-3, 0, num=10, base=2) * 3 * math.sqrt(
        math.log(n) / (T * (pmin**2)))

    node_infls = []
    cl_diffs = []
    theta_diffs = []
    for i, alpha_v in enumerate(alphas):
        # every fit is initialised from the true clustering
        res = matrix_competition('ALTER',
                                 1,
                                 c_num,
                                 D0,
                                 D1,
                                 alpha_v,
                                 index_init=ind_star,
                                 epochs=20)
        theta_est = res.theta

        cl_diffs.append(index_dist(c_num, res.index, ind_star))
        theta_diffs.append(np.linalg.norm(theta_est - theta_star, ord='fro'))
        node_infls.append(np.linalg.norm(theta_est, ord=2, axis=0))

        # 1-based progress counter against the actual grid size
        print("K={}: {}/{}".format(c_num, i + 1, len(alphas)))

    if show_heatmaps:
        # estimate obtained with the last penalty of the grid
        sns.set()
        sns.heatmap(theta_est, center=0)
        plt.show()

        sns.set()
        sns.heatmap(theta_star, center=0)
        plt.show()

    return alphas, np.array(theta_diffs), np.array(cl_diffs), np.array(
        node_infls)
Пример #6
0
def _balanced_index(n, c_num):
    """Assign ``n`` nodes to ``c_num`` contiguous, near-equal clusters.

    The first ``n - c_num * (n // c_num)`` clusters receive one extra node.
    """
    c_size = int(n // c_num)
    r = n - c_num * c_size
    big_total = r * (c_size + 1)
    return np.array([
        i // (c_size + 1) if i < big_total else (i - big_total) // c_size + r
        for i in range(n)
    ])


def do(n, T, c_num, s, pmin):
    """Simulate a clustered VAR(1) series and compare clusterings across windows.

    A ground-truth ``theta_star`` with ``c_num`` clusters is generated, the
    series is masked with keep-probability ``pmin``, and for every candidate
    cluster number the 'ALTER' matrix competition is fitted on six
    overlapping windows. The ``index_dist`` between the clustering of the
    first window and each of the other windows is written to
    ``results/simu_n{n}t{T}cnum{c_num}pmin{pmin}.csv`` (one column per
    candidate cluster number). The ``results`` directory is created if it
    does not exist.

    :param n: number of series.
    :param T: number of time steps.
    :param c_num: number of clusters of the ground truth.
    :param s: currently ignored, see the NOTE(review) in the body.
    :param pmin: probability that an entry is observed.
    :return: None.
    """
    import os  # local import: only needed to prepare the output directory

    # set up theta_star
    ind_star = _balanced_index(n, c_num)
    z_star = alternating.get_index_matrix(c_num, ind_star)

    # define true theta
    # NOTE(review): the caller-supplied s is overridden here, so the argument
    # has no effect. Kept to preserve existing results -- confirm whether the
    # override is intentional.
    s = 1
    assert (s <= 5)
    v_star = np.zeros((c_num, n))
    active_vals = [0.6, -0.4, 0.1, -0.8, 0.2]
    for j in range(c_num):
        v_star[j, j:j + s] = np.array(active_vals[:s]) * ((-1)**j)

    theta_star = np.matmul(z_star, v_star)
    # rescale so that the spectral norm of theta_star equals 0.5
    coef = 0.5 / np.linalg.norm(theta_star, 2)
    theta_star = theta_star * coef

    # generate the time series
    x = alternating.gauss_var1_process(theta_star, 1., T)

    # include missing observations: unobserved entries are zeroed out
    mask = np.random.binomial(1, pmin, size=(n, T)).astype(np.float64)
    x_missing = x * mask

    # estimate missing probabilities (empirical observation rate per row)
    deltas = np.mean(mask, axis=1)

    # number of windows and their length; the windows do not depend on the
    # candidate cluster number, so they are built once
    sim_num = 6
    win_len = (3 * T) // 4
    stride = (T - win_len + 1) // sim_num
    windows = [(k * stride, k * stride + win_len) for k in range(sim_num)]

    # candidates cluster numbers
    c_nums = list(range(2, min(20, n // 2) + 1))

    ans_idx = []
    for c in c_nums:
        ress = []
        for a, b in windows:
            x_train = x_missing[:, a:b - 1]
            y_train = x_missing[:, a + 1:b]

            D0 = missing.missing_var(x_train, deltas)
            D1 = missing.missing_covar(x_train, y_train, deltas, deltas)

            # (c + 1)-th largest eigenvalue of D0 (eigvalsh sorts ascending)
            lmbd = (np.linalg.eigvalsh(D0)[-(c + 1)]) * np.sqrt(
                np.log(n)) / np.sqrt(T * np.min(deltas)**2)
            # NOTE(review): the original also built a balanced initial index
            # for c clusters here but never passed it to matrix_competition;
            # that dead computation was dropped without changing behaviour.
            ress.append(
                alternating.matrix_competition("ALTER",
                                               5,
                                               c,
                                               D0,
                                               D1,
                                               lmbd,
                                               epochs=20))

        # distance of every later window's clustering to the first window's
        ans_idx.append([
            alternating.index_dist(c, ress[0].index, ress[j].index)
            for j in range(1, sim_num)
        ])

    save_path = "results/simu_n{}t{}cnum{}pmin{}".format(n, T, c_num, pmin)

    res = np.zeros((sim_num - 1, len(c_nums)))
    for i, dists in enumerate(ans_idx):
        res[:, i] = dists
    df = DataFrame(data=res, columns=c_nums)
    # make sure the target directory exists so the finished run is not lost
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df.to_csv(save_path + ".csv")