def write_cv_data(K, data_dir, idx, W, L, T, D, N, phi, alpha, beta, chains):
    """Generate one count matrix and write K cross-validation splits of it.

    A count matrix is drawn with ``generate_lda``. For every row, the
    individual token occurrences are shuffled and partitioned into ``K``
    folds; fold ``k`` yields a training count matrix (``Bs[k]``) and a
    held-out count matrix (``test_counts[k]``). The held-out matrices are
    pickled to ``<data_dir>/counts_<k>.pkl`` and the training matrices are
    handed to the ``write_*_cv`` writers with ``write_params=False``.

    Args:
        K: number of folds.
        data_dir: directory receiving the ``counts_<k>.pkl`` files; also
            forwarded to every writer.
        idx: forwarded unchanged to the writers.
        W: number of columns of the per-fold matrices (indexed by term).
        L: unused by this function.
        T: forwarded to ``generate_lda`` and the writers
            (presumably the number of topics -- confirm).
        D: number of rows of the per-fold matrices (one row per document).
        N: forwarded to ``generate_lda``.
        phi: forwarded to ``generate_lda``.
        alpha: forwarded to ``generate_lda`` and the writers.
        beta: forwarded to the writers.
        chains: forwarded to ``write_stan_cv``.

    Returns:
        None. Results are written to disk by this function and the writers.
    """
    B = generate_lda(T, W, D, N, phi, alpha)

    # split cv data
    B_sparse = csr_matrix(B)
    Bs = [dok_matrix((D, W), dtype=np.float32) for _ in range(K)]
    test_counts = [dok_matrix((D, W), dtype=np.float32) for _ in range(K)]

    for d in range(B_sparse.shape[0]):
        crow = B_sparse[d, :].tocoo()

        # Expand the row into one entry per token occurrence so that the
        # split is made over tokens rather than over distinct terms.
        # ``zip`` / ``.items()`` are used instead of ``izip`` /
        # ``.iteritems()`` because they behave the same here on Python 2
        # and also exist on Python 3.
        list_of_tokens = []
        for term_idx, count in zip(crow.col, crow.data):
            list_of_tokens += [term_idx] * count
        list_of_tokens = list(np.random.permutation(np.array(list_of_tokens)))

        # NOTE(review): this is the KFold(n, n_folds=...) call form; whether
        # rows with fewer than K tokens are accepted depends on that
        # implementation -- confirm against the file's imports.
        kf = KFold(len(list_of_tokens), n_folds=K)
        for k, (train, test) in enumerate(kf):
            # Training counts are filled first, then held-out counts, using
            # the same re-counting procedure for both index sets.
            for target, positions in ((Bs[k], train), (test_counts[k], test)):
                fold_counts = collections.Counter(
                    list_of_tokens[i] for i in positions)
                for w, count in fold_counts.items():
                    target[d, w] = count

    Bs = [csr_matrix(m) for m in Bs]
    test_counts = [csr_matrix(m) for m in test_counts]

    for i, counts in enumerate(test_counts):
        counts_path = os.path.join(data_dir, 'counts_{}.pkl'.format(i))
        # Pickle data is bytes: open in binary mode, and use a context
        # manager so the handle is closed (and flushed) deterministically.
        with open(counts_path, 'wb') as counts_file:
            pickle.dump(counts, counts_file)

    write_pb_cv(data_dir, idx, W, T, D, alpha, beta, Bs, write_params=False)
    write_stan_cv(data_dir, idx, W, T, D, alpha, beta, Bs, chains=chains,
                  write_params=False)
    write_prism_cv(data_dir, idx, W, T, D, alpha, beta, Bs, write_params=False)
    write_txt_cv(data_dir, idx, Bs, T, alpha, beta, write_params=False)
def write_data(data_dir, idx, W, L, T, D, N, phi, alpha, beta, chains):
    """Generate a count matrix, pickle it, and write it in every format.

    The matrix returned by ``generate_lda`` is pickled to
    ``<data_dir>/counts.pkl`` and then passed to ``write_pb``,
    ``write_stan``, ``write_prism`` and ``write_txt`` with their default
    ``write_params`` behaviour.

    NOTE(review): a second ``def write_data`` with the same signature
    follows this one in the file; as written, that later definition
    replaces this one when the module is loaded. Confirm which of the two
    is intended to be live.

    Args:
        data_dir: output directory for ``counts.pkl``; forwarded to writers.
        idx: forwarded unchanged to the writers.
        W, T, D: forwarded to ``generate_lda`` and the writers.
        L: unused by this function.
        N, phi: forwarded to ``generate_lda``.
        alpha: forwarded to ``generate_lda`` and the writers.
        beta: forwarded to the writers.
        chains: forwarded to ``write_stan``.

    Returns:
        None.
    """
    B = generate_lda(T, W, D, N, phi, alpha)

    # Pickle data is bytes: open in binary mode, and use a context manager
    # so the handle is closed (and flushed) before the writers run.
    with open(os.path.join(data_dir, 'counts.pkl'), 'wb') as counts_file:
        pickle.dump(B, counts_file)

    write_pb(data_dir, idx, W, T, D, alpha, beta, B)
    write_stan(data_dir, idx, W, T, D, alpha, beta, B, chains=chains)
    write_prism(data_dir, idx, W, T, D, alpha, beta, B)
    write_txt(data_dir, idx, B, T, alpha, beta)
def write_data(data_dir, idx, W, L, T, D, N, phi, alpha, beta, chains):
    """Generate a count matrix and write it in every format, without params.

    The matrix from ``generate_lda`` is passed to ``write_pb``,
    ``write_stan``, ``write_prism`` and ``write_txt``, each called with
    ``write_params=False``. Nothing is pickled here.

    NOTE(review): within the visible part of the file this is the later of
    two ``write_data`` definitions, so it is the one bound at module level
    unless the name is rebound again further down.

    Args:
        data_dir, idx: forwarded unchanged to the writers.
        W, T, D: forwarded to ``generate_lda`` and the writers.
        L: unused by this function.
        N, phi: forwarded to ``generate_lda``.
        alpha: forwarded to ``generate_lda`` and the writers.
        beta: forwarded to the writers.
        chains: forwarded to ``write_stan``.

    Returns:
        None.
    """
    counts = generate_lda(T, W, D, N, phi, alpha)

    # Positional arguments shared by the pb / stan / prism writers.
    shared = (data_dir, idx, W, T, D, alpha, beta, counts)

    write_pb(*shared, write_params=False)
    write_stan(*shared, chains=chains, write_params=False)
    write_prism(*shared, write_params=False)
    # write_txt takes a different positional order (counts before T).
    write_txt(data_dir, idx, counts, T, alpha, beta, write_params=False)
def write_pb_church(data_dir, idx, W, L, T, D, N, phi, alpha, beta, n_samples, lag):
    """Generate a count matrix and write it via ``write_pb`` and both church writers.

    The matrix from ``generate_lda`` is passed to ``write_pb``,
    ``write_church`` and ``write_church2``.

    Args:
        data_dir, idx: forwarded unchanged to the writers.
        W, T, D: forwarded to ``generate_lda`` and the writers.
        L: unused by this function.
        N: forwarded to ``generate_lda`` and ``write_church``
            (``write_church2`` does not receive it).
        phi: forwarded to ``generate_lda``.
        alpha: forwarded to ``generate_lda`` and the writers.
        beta: forwarded to the writers.
        n_samples, lag: forwarded to ``write_church`` and ``write_church2``.

    Returns:
        None.
    """
    counts = generate_lda(T, W, D, N, phi, alpha)

    # Trailing sampler settings shared by both church writers.
    sampler_args = (n_samples, lag)

    write_pb(data_dir, idx, W, T, D, alpha, beta, counts)
    write_church(data_dir, idx, counts, alpha, beta, D, T, W, N, *sampler_args)
    write_church2(data_dir, idx, counts, alpha, beta, D, T, W, *sampler_args)