def get_prediction(Y_test, theta, deltas):
    """Evaluate the one-step-ahead prediction loss of `theta` on held-out data.

    Returns tr(Cov(y) - 2*theta*Cov(x,y) + theta*Cov(x)*theta^T), where the
    covariances are the missing-data-corrected estimates for the lagged
    (time t) and led (time t+1) slices of `Y_test`.
    """
    lagged = Y_test[:, :-1]  # predictors: columns 0..T-2
    led = Y_test[:, 1:]      # targets: columns 1..T-1
    # Missing-data-corrected second-moment estimates (no PSD projection).
    cov_x = missing.missing_var(lagged, deltas, make_positive=False)
    cov_y = missing.missing_var(led, deltas, make_positive=False)
    cross = missing.missing_covar(lagged, led, deltas, deltas)
    quadratic = theta @ (cov_x @ theta.T)
    return (cov_y - 2 * (theta @ cross) + quadratic).trace()
def get_sonic_theta(Y_train, deltas, num_clusters, lmbd):
    """Fit the SONIC estimator on a training window.

    Builds the missing-data-corrected lag-0 and lag-1 covariance estimates
    from `Y_train` and runs the alternating matrix-competition solver with
    `num_clusters` clusters and penalty `lmbd`; returns the estimated
    transition matrix.
    """
    past = Y_train[:, :-1]
    future = Y_train[:, 1:]
    cov0 = missing.missing_var(past, deltas)
    cov1 = missing.missing_covar(past, future, deltas, deltas)
    fit = matrix_competition('ALTER', 5, num_clusters, cov0, cov1, lmbd, epochs=50)
    return fit.theta
from common.alternating import matrix_competition

if __name__ == '__main__':
    # Each task pairs an input CSV of user sentiment time series with a
    # label used when saving the fitted model.
    tasks = [
        ('../data/users_daily_timeseries_AAPL.csv', "theta_daily_AAPL"),
        ('../data/users_BTC_timeseries_Daily.csv', "theta_daily_BTC")
    ]
    for task in tasks:
        data_path, save_path = task
        # Load the series; keep users with at least 50 observed days and an
        # observation frequency of at least 0.5.  `deltas` holds per-user
        # observation frequencies, `names` the user identifiers.
        Y, deltas, names = read_data.read_stock_twits_user_sentiment(data_path, min_days=50, min_delta=0.5)
        N, tmax = np.shape(Y)
        # Missing-data-corrected lag-0 and lag-1 covariance estimates.
        D0 = missing.missing_var(Y[:, :-1], deltas)
        D1 = missing.missing_covar(Y[:, :-1], Y[:, 1:], deltas, deltas)
        # SET NUMBER OF CLUSTERS
        num_clusters = 2
        # Theoretical penalty: the (num_clusters+1)-th largest eigenvalue of
        # D0 (eigvalsh returns ascending order) scaled by
        # sqrt(log N / (tmax * min(deltas)^2)).
        lmbd = (np.linalg.eigvalsh(D0)[-(num_clusters + 1)]) * np.sqrt(np.log(N)) / np.sqrt(tmax * np.min(deltas) ** 2)
        print("lambda={}".format(lmbd))
        res = matrix_competition('ALTER', 5, num_clusters, D0, D1, lmbd, epochs=50)
        theta_est, v_est, u_est, ind_est, loss = res.theta, res.v, res.u, res.index, res.loss
        # Group user indices by their estimated cluster label.
        lists = [[] for i in range(num_clusters)]
        for i in range(N):
            lists[res.index[i]].append(i)
for task in tasks:
    # NOTE(review): `task` is used here as a single path (no tuple
    # unpacking), unlike the loop in the other script — confirm that
    # `tasks` holds bare paths in this fragment.
    data_path = task
    Y, deltas, names = read_data.read_stock_twits_user_sentiment(
        data_path, min_days=50, min_delta=0.5)
    N, tmax = np.shape(Y)
    # 70/30 chronological split; the test window starts one column before
    # the training window ends so one-step-ahead predictions line up.
    t_train = ceil(tmax * 0.7)
    Y_train = Y[:, :t_train]
    Y_test = Y[:, t_train - 1:]
    # SET NUMBER OF CLUSTERS
    num_clusters = 2
    # theoretical lambda
    D0 = missing.missing_var(Y_train, deltas)
    lmbd = (np.linalg.eigvalsh(D0)[-(num_clusters + 1)]) * np.sqrt(
        np.log(N)) / np.sqrt(tmax * np.min(deltas)**2)
    ##############################
    # we compare four methods:
    # 1) SONIC
    # 2) VAR + lasso = SONIC with K = N
    # 3) VAR = SONIC with K = N and lambda = 0
    # 4) theta = 0 (no causality)
    # methods = ["SONIC", "VAR+LASSO", "VAR", "ZERO"]
    # NOTE(review): this dict literal is cut off at the chunk boundary —
    # the remaining entries are outside the visible source.
    thetas = {
        "SONIC": get_sonic_theta(Y_train, deltas, num_clusters, lmbd),
        "VAR+LASSO": get_sonic_theta(Y_train, deltas, N, lmbd),
def simu(n, c_num, s, T, pmin=1.0):
    """Run one clustered-VAR(1) recovery experiment over a penalty grid.

    Builds a ground-truth transition matrix with `c_num` node clusters
    (`s` active coefficients per cluster row), simulates a length-`T`
    Gaussian VAR(1) process, masks observations i.i.d. with probability
    `pmin`, and fits the alternating estimator for 10 log-spaced penalties.

    Parameters
    ----------
    n : int
        Number of series (nodes).
    c_num : int
        Number of clusters in the true model.
    s : int
        Non-zero coefficients per cluster row; must be <= 5.
    T : int
        Length of the simulated series.
    pmin : float
        Per-entry observation probability (1.0 = fully observed).

    Returns
    -------
    tuple
        (alphas, theta_diffs, cl_diffs, node_infls): the penalty grid, the
        Frobenius errors ||theta_est - theta_star||, the clustering
        distances to the true partition, and the per-node column norms of
        each estimate.
    """
    # True partition: r clusters of size c_size+1, the rest of size c_size.
    c_size = int(n // c_num)
    r = n - c_num * c_size
    ind_star = np.array([
        int(i // (c_size + 1)) if i < r * (c_size + 1) else int(
            (i - r * (c_size + 1)) // c_size) + r for i in range(n)
    ])
    z_star = get_index_matrix(c_num, ind_star)

    # True theta: each cluster row carries up to 5 alternating-sign values.
    assert (s <= 5)
    v_star = np.zeros((c_num, n))
    active_vals = [0.6, -0.4, 0.1, -0.8, 0.2]
    for j in range(c_num):
        v_star[j, j:j + s] = np.array(active_vals[:s]) * ((-1)**j)
    theta_star = np.matmul(z_star, v_star)
    # Rescale so the spectral norm is 0.5, keeping the process stable.
    coef = 0.5 / np.linalg.norm(theta_star, 2)
    v_star = v_star * coef
    theta_star = theta_star * coef

    # Simulate the series, then drop entries i.i.d. with probability 1-pmin.
    x = gauss_var1_process(theta_star, 1., T)
    mask = np.random.binomial(1, pmin, size=(n, T)).astype(np.float64)
    x_missing = x * mask
    x_train = x_missing[:, :-1]
    y_train = x_missing[:, 1:]
    deltas = np.mean(mask, axis=1)  # empirical per-node observation rates
    D0 = missing.missing_var(x_train, deltas)
    D1 = missing.missing_covar(x_train, y_train, deltas, deltas)

    # Penalty grid: 10 log-spaced multiples of the theoretical rate
    # 3 * sqrt(log n / (T * pmin^2)).
    alphas = np.logspace(-3, 0, num=10, base=2) * 3 * math.sqrt(
        math.log(n) / (T * (pmin**2)))

    node_infls = []
    cl_diffs = []
    theta_diffs = []
    for i, alpha_v in enumerate(alphas):
        # Warm-start the clustering at the true partition.
        res = matrix_competition('ALTER', 1, c_num, D0, D1, alpha_v,
                                 index_init=ind_star, epochs=20)
        theta_est = res.theta
        cl_diffs.append(index_dist(c_num, res.index, ind_star))
        theta_diffs.append(np.linalg.norm(theta_est - theta_star, ord='fro'))
        node_infls.append(np.linalg.norm(theta_est, ord=2, axis=0))
        print("K={}: {}/10".format(c_num, i))

    return alphas, np.array(theta_diffs), np.array(cl_diffs), np.array(
        node_infls)
def do(n, T, c_num, s, pmin):
    """Stability-based selection of the cluster count on simulated data.

    Simulates a clustered VAR(1) with missing observations, then for each
    candidate cluster count fits the alternating estimator on `sim_num`
    sliding windows and records the clustering distance between the first
    window's partition and each later window's.  The resulting table is
    written to results/simu_n{n}t{T}cnum{c_num}pmin{pmin}.csv.

    Parameters
    ----------
    n, T : int
        Number of series and series length.
    c_num : int
        True number of clusters used to build theta_star.
    s : int
        Nominal sparsity per cluster row.  NOTE(review): immediately
        overridden to 1 below — confirm whether the argument should be used.
    pmin : float
        Per-entry observation probability.
    """
    # True partition: r clusters of size c_size+1, the rest of size c_size.
    c_size = int(n // c_num)
    r = n - c_num * c_size
    ind_star = np.array([
        int(i // (c_size + 1)) if i < r * (c_size + 1) else int(
            (i - r * (c_size + 1)) // c_size) + r for i in range(n)
    ])
    z_star = alternating.get_index_matrix(c_num, ind_star)

    # True theta with alternating-sign active values, spectral norm 0.5.
    s = 1  # NOTE(review): shadows the `s` parameter — kept to preserve behavior.
    assert (s <= 5)
    v_star = np.zeros((c_num, n))
    active_vals = [0.6, -0.4, 0.1, -0.8, 0.2]
    for j in range(c_num):
        v_star[j, j:j + s] = np.array(active_vals[:s]) * ((-1)**j)
    theta_star = np.matmul(z_star, v_star)
    coef = 0.5 / np.linalg.norm(theta_star, 2)
    theta_star = theta_star * coef

    # Simulate, mask entries i.i.d. with prob. 1-pmin, estimate the
    # per-node observation rates from the mask.
    x = alternating.gauss_var1_process(theta_star, 1., T)
    mask = np.random.binomial(1, pmin, size=(n, T)).astype(np.float64)
    x_missing = x * mask
    deltas = np.mean(mask, axis=1)

    # sim_num sliding windows of length win_len = 3T/4, starts evenly
    # spread over the admissible range.
    sim_num = 6
    win_len = (3 * T) // 4
    # Candidate cluster numbers.
    c_nums = list(range(2, min(20, n // 2) + 1))
    ans_idx = []
    for c in c_nums:
        ress = []
        for (a, b) in [(k * ((T - win_len + 1) // sim_num),
                        k * ((T - win_len + 1) // sim_num) + win_len)
                       for k in range(sim_num)]:
            x_train = x_missing[:, a:b - 1]
            y_train = x_missing[:, a + 1:b]
            D0 = missing.missing_var(x_train, deltas)
            D1 = missing.missing_covar(x_train, y_train, deltas, deltas)
            # Theoretical penalty for c clusters on this window.
            lmbd = (np.linalg.eigvalsh(D0)[-(c + 1)]) * np.sqrt(
                np.log(n)) / np.sqrt(T * np.min(deltas)**2)
            ress.append(
                alternating.matrix_competition("ALTER", 5, c, D0, D1, lmbd,
                                               epochs=20))
        # Stability score: distance of each later window's partition to
        # the first window's.
        ans_idx.append([
            alternating.index_dist(c, ress[0].index, ress[j].index)
            for j in range(1, sim_num)
        ])

    save_path = "results/simu_n{}t{}cnum{}pmin{}".format(n, T, c_num, pmin)
    # Columns are candidate cluster counts; rows are the sim_num-1 windows.
    res = np.zeros((sim_num - 1, len(c_nums)))
    for i, dists in enumerate(ans_idx):
        res[:, i] = dists
    df = DataFrame(data=res, columns=c_nums)
    df.to_csv(save_path + ".csv")