def construct_from_svd(U, s, V, cfg):
    """Build nonnegative factors Phi, Theta from a truncated SVD.

    For each of the first T singular triplets, keeps whichever sign pattern
    (positive or negative parts of the singular vectors) carries more L1 mass.
    """
    T = cfg['T']
    Phi = np.zeros((U.shape[0], T))
    Theta = np.zeros((T, V.shape[1]))
    for i in range(T):
        x = U[:, i]
        y = V[i, :]
        # Split each singular vector into its positive and negative parts.
        xp = np.copy(x)
        xp[xp < 0] = 0
        xn = -np.copy(x)
        xn[xn < 0] = 0
        yp = np.copy(y)
        yp[yp < 0] = 0
        yn = -np.copy(y)
        yn[yn < 0] = 0
        xp_norm = np.linalg.norm(xp, ord=1)
        yp_norm = np.linalg.norm(yp, ord=1)
        xn_norm = np.linalg.norm(xn, ord=1)
        yn_norm = np.linalg.norm(yn, ord=1)
        # Keep the sign pattern with the larger product of L1 masses.
        if xp_norm * yp_norm > xn_norm * yn_norm:
            Phi[:, i] = np.sqrt(s[i] * xp_norm * yp_norm) * xp / xp_norm
            Theta[i, :] = np.sqrt(s[i] * xp_norm * yp_norm) * yp / yp_norm
        else:
            Phi[:, i] = np.sqrt(s[i] * xn_norm * yn_norm) * xn / xn_norm
            Theta[i, :] = np.sqrt(s[i] * xn_norm * yn_norm) * yn / yn_norm
    return normalize_cols(Phi), normalize_cols(Theta)

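# Usage sketch added for illustration (the _demo_* helpers are not part of the
# original module). Seeds nonnegative factors from a truncated SVD of a toy
# matrix; assumes numpy as np and the normalize_cols helper used throughout.
def _demo_construct_from_svd():
    V = normalize_cols(np.abs(np.random.randn(500, 200)))  # toy term-document matrix
    U, s, Vt = np.linalg.svd(V, full_matrices=False)
    Phi, Theta = construct_from_svd(U, s, Vt, {'T': 10})
    # Each component keeps its dominant sign pattern, so the factors are
    # nonnegative before the final column normalization.
    assert Phi.min() >= 0 and Theta.min() >= 0
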
def plsa3D(V, W, H, post='', cfg=config.default_config()):
    """Probabilistic Latent Semantic Analysis, written as an explicit E-step.

    Materializes the full responsibility tensor
    Z[n, t, m] = V[n, m] * W[n, t] * H[t, m] / (WH)[n, m],
    so it is equivalent to a simultaneous multiplicative PLSA update but
    memory-heavy: Z takes O(N * T * M) space.
    """
    eps = cfg['eps']
    (N, M) = V.shape
    T = H.shape[0]
    V3 = V.reshape(N, M, 1).repeat(T, 2).swapaxes(1, 2)
    W3 = W.reshape(N, T, 1).repeat(M, 2)
    H3 = H.T.reshape(M, T, 1).repeat(N, 2).swapaxes(0, 2)
    Q3 = dot(W, H).reshape(N, M, 1).repeat(T, 2).swapaxes(1, 2)
    Z = V3 * W3 * H3 / (Q3 + eps)
    W = normalize_cols(sum(Z, 2).reshape(N, T))
    H = normalize_cols(sum(Z, 0).reshape(T, M))
    return W, H

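# Illustrative check (added, hypothetical helper): the 3-D E-step above agrees
# with the equivalent 2-D multiplicative update; it just materializes the full
# (N, T, M) responsibility tensor instead.
def _demo_plsa3d_matches_2d_update():
    N, T, M = 20, 3, 15
    eps = 1e-12
    V = normalize_cols(np.abs(np.random.randn(N, M)))
    W = normalize_cols(np.abs(np.random.randn(N, T)))
    H = normalize_cols(np.abs(np.random.randn(T, M)))
    W1, H1 = plsa3D(V, W, H, cfg={'eps': eps})
    # The same update written with 2-D matrix products only.
    tmp = V / (np.dot(W, H) + eps)
    assert np.allclose(W1, normalize_cols(W * np.dot(tmp, H.T)))
    assert np.allclose(H1, normalize_cols(H * np.dot(W.T, tmp)))
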
def gen_matrix_topic(params):
    N, T = params['rows'], params['cols']
    phi = np.zeros((N, T))
    sparse = params['sparsity']  # sparseness (the main parameter)
    if sparse < params['eps']:
        sparse = params['eps']
    elif sparse > 1:
        sparse = 1
    nkernel = params['nkernel']  # average number of kernel words per topic
    nnoise = params['nnoise']    # number of noise (smooth) topics
    ntopic = T - nnoise
    kernel = np.maximum(1, np.random.binomial(N, min(1, nkernel / (N * sparse)), ntopic))
    s = 0
    for i in range(ntopic):
        phi[s:s + kernel[i], i] = -np.sort(-np.random.exponential(0.5, kernel[i]))
        s = s + int(kernel[i] * sparse)
        # Clamp the next kernel so its block still fits into the N rows.
        if i < ntopic - 1 and s + kernel[i + 1] > N:
            kernel[i + 1] = max(1, N - s)
            s = N - kernel[i + 1]
    if N - s - kernel[-1] + 1 > 0:
        if nnoise == 0:
            phi[s + kernel[-1] - 1:, :] = np.random.random_sample((N - s - kernel[-1] + 1, T))
        else:
            #phi[s+kernel[-1]-1:, ntopic:] = np.random.random_sample((N-s-kernel[-1]+1, nnoise))
            phi[:, ntopic:] = np.random.random_sample((N, nnoise))
    return normalize_cols(phi)

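# Usage sketch (added for illustration; parameter values are hypothetical).
# Generates a word-topic matrix with 10 sparse kernel topics plus 2 smooth
# noise topics.
def _demo_gen_matrix_topic():
    params = {'rows': 200, 'cols': 12, 'sparsity': 0.5, 'eps': 1e-8,
              'nkernel': 20, 'nnoise': 2}
    phi = gen_matrix_topic(params)
    assert phi.shape == (200, 12)
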
def gen_matrix_sparse(params):
    rows = params['rows']
    cols = params['cols']
    sparsity = params['sparsity']
    M = np.zeros((rows, cols), dtype='float32')
    # Each column is a Dirichlet draw; a small concentration gives sparser columns.
    for i in range(cols):
        M[:, i] = np.random.dirichlet([sparsity] * rows)
    return normalize_cols(M)

def grad_desc(V, W, H, post='', cfg=config.default_config()):
    alpha = cfg[post + '_alpha']
    step = cfg[post + '_alpha_step']
    eps = cfg['eps']
    #print('Gradient Descent with alpha={alpha}.'.format(alpha=alpha))
    grad_W = dot((V - dot(W, H)), H.T)
    grad_H = dot(W.T, (V - dot(W, H)))
    #grad_W[grad_W < eps] = 0
    #grad_H[grad_H < eps] = 0
    W = W + alpha * grad_W
    # Zero out entries that are both small and not growing.
    W[(grad_W < eps) & (W < eps)] = 0
    W = normalize_cols(W)
    H = H + alpha * grad_H
    H[(grad_H < eps) & (H < eps)] = 0
    H = normalize_cols(H)
    # Geometrically decaying step size, persisted back into cfg.
    alpha = alpha * step
    cfg[post + '_alpha'] = alpha
    return (W, H)

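# Usage sketch (added; _demo_grad_desc is a hypothetical helper). Shows the
# cfg keys grad_desc reads and that the step size decays geometrically and is
# written back into cfg between calls.
def _demo_grad_desc():
    cfg = {'grad_desc_alpha': 0.1, 'grad_desc_alpha_step': 0.9, 'eps': 1e-8}
    N, T, M = 30, 4, 25
    V = normalize_cols(np.abs(np.random.randn(N, M)))
    W = normalize_cols(np.abs(np.random.randn(N, T)))
    H = normalize_cols(np.abs(np.random.randn(T, M)))
    for _ in range(10):
        W, H = grad_desc(V, W, H, post='grad_desc', cfg=cfg)
    assert cfg['grad_desc_alpha'] < 0.1  # 0.1 * 0.9**10
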
def gen_matrix_sparse(params):
    # Alternative sparse initializer: scatters (1 - sparsity) * rows * cols
    # random entries, then mixes in an epsilon diagonal so no column is empty.
    rows = params['rows']
    cols = params['cols']
    sparsity = params['sparsity']
    M = np.zeros((rows, cols), dtype='float32')
    sz = int(rows * cols * (1 - sparsity))
    idx0_t = [i for i in range(rows) for j in range(cols)]
    np.random.shuffle(idx0_t)
    idx0 = idx0_t[:sz]
    idx1_t = [j for i in range(rows) for j in range(cols)]
    np.random.shuffle(idx1_t)
    idx1 = idx1_t[:sz]
    M[idx0, idx1] = np.random.sample(sz)
    if rows < cols:
        M[:, :rows] = M[:, :rows] + np.eye(rows) * params['eps']
        M[0, rows:] = params['eps']
    else:
        M[:cols, :] = M[:cols, :] + np.eye(cols) * params['eps']
    return normalize_cols(M)

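# Usage sketch (added for illustration; works with either gen_matrix_sparse
# variant above, since the params dict covers both). Higher sparsity leaves
# fewer nonzero entries before column normalization.
def _demo_gen_matrix_sparse():
    params = {'rows': 50, 'cols': 10, 'sparsity': 0.8, 'eps': 1e-8}
    M = gen_matrix_sparse(params)
    assert M.shape == (50, 10)
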
def gen_matrix_normal(params):
    return normalize_cols(abs(np.random.randn(params['rows'], params['cols'])))

def gen_matrix_uniform(params):
    return normalize_cols(np.random.uniform(size=(params['rows'], params['cols'])))

def run(V, W, H, W_r=None, H_r=None, cfg=config.default_config()):
    T = H.shape[0]
    eps = cfg['eps']
    schedule = cfg['schedule'].split(',')
    meas = cfg['measure'].split(',')
    val = np.zeros((cfg['max_iter'] + 2, len(meas)))
    hdist = np.zeros((cfg['max_iter'] + 2, 1))
    for i, fun_name in enumerate(meas):
        fun = getattr(measure, fun_name)
        val[0, i] = fun(V, np.dot(W, H))
    if cfg['compare_real']:
        #m = Munkres()
        idx = get_permute(W_r, H_r, W, H, cfg['munkres'])
        hdist[0] = hellinger(W[:, idx[:, 1]], W_r[:, idx[:, 0]]) / T
    if cfg['print_lvl'] > 1:
        print('Initial loss:', val[0])
    status = 0
    methods_num = len(schedule)
    it = -1
    for it in range(cfg['max_iter']):
        if cfg['print_lvl'] > 1:
            print('Iteration', it + 1)
        W_old = deepcopy(W)
        H_old = deepcopy(H)
        # Methods are applied round-robin according to the schedule.
        method_name = schedule[it % methods_num]
        if cfg['print_lvl'] > 1:
            print('Method:', method_name)
        method = getattr(methods, method_name)
        (W, H) = method(V, W, H, method_name, cfg)
        if (it + 1) % cfg['normalize_iter'] == 0:
            W = normalize_cols(W)
            H = normalize_cols(H)
        for j, fun_name in enumerate(meas):
            fun = getattr(measure, fun_name)
            val[it + 1, j] = fun(V, np.dot(W, H))
        if cfg['compare_real']:
            idx = get_permute(W_r, H_r, W, H, cfg['munkres'])
            hdist[it + 1] = hellinger(W[:, idx[:, 1]], W_r[:, idx[:, 0]]) / T
        if cfg['print_lvl'] > 1:
            print(val[it + 1])
        # Stop either when the cost is small or when the iterates stall.
        if all(val[it, :] < eps):
            if cfg['print_lvl'] > 1:
                print('By cost.')
            status = 1
            break
        if abs(W_old - W).max() < eps and abs(H_old - H).max() < eps:
            if cfg['print_lvl'] > 1:
                print('By argument.')
            status = 2
            break
    if cfg['print_lvl'] > 1:
        print('Final:')
    W = normalize_cols(W)
    H = normalize_cols(H)
    # Pad the remaining rows with the final values.
    for j, fun_name in enumerate(meas):
        fun = getattr(measure, fun_name)
        val[it + 2:, j] = fun(V, np.dot(W, H))
    if cfg['compare_real']:
        idx = get_permute(W_r, H_r, W, H, cfg['munkres'])
        hdist[it + 2:] = hellinger(W[:, idx[:, 1]], W_r[:, idx[:, 0]]) / T
    return (val, hdist, it, W, H, status)

def main(config_file='config.txt', results_file='results.txt', cfg=None):
    if cfg is None:
        cfg = config.load(config_file)
    if cfg['seed'] >= 0:
        np.random.seed(cfg['seed'])
    else:
        np.random.seed(None)
    eps = cfg['eps']
    N = cfg['N']
    T = cfg['T']
    M = cfg['M']
    vocab = None
    W_r = None
    H_r = None
    if cfg['run_info'] == 'results' or cfg['run_info'] == 1:
        cfg['print_lvl'] = 1
    elif cfg['run_info'] == 'run' or cfg['run_info'] == 2:
        cfg['print_lvl'] = 2
    else:
        cfg['print_lvl'] = 0
    if cfg['print_lvl'] > 0:
        print('Generating...')
    if cfg['load_data'] == 'uci' or cfg['load_data'] == 2:
        V, vocab = load_uci(cfg['data_name'], cfg)
        V = normalize_cols(V)
        N, M = V.shape
        cfg['N'], cfg['M'] = V.shape
        print('Size:', N, M)
    elif cfg['load_data'] == 'csv' or cfg['load_data'] == 1:
        _, W_r, H_r = load_csv(cfg['gen_name'], cfg)
        #plt.matshow(1-W_r, cmap=plt.cm.gray)
        #plt.title('real')
        V, vocab = load_uci(cfg['gen_name'], cfg)
        V = normalize_cols(V)
        N, M = V.shape
        cfg['N'], cfg['M'] = V.shape
        print('Size:', N, M)
        cfg['T_0'] = W_r.shape[1]
    else:
        V, W_r, H_r = gen_real(cfg)
    print('Checking assumption on V:', np.sum(V, axis=0).max())
    #tp = '0_5_100_16_500'
    #V_filename = 'datasets/V.' + tp + '.txt.csv'
    #W_filename = 'datasets/W.' + tp + '.txt.csv'
    #H_filename = 'datasets/H.' + tp + '.txt.csv'
    #V = np.loadtxt(V_filename, delimiter=',')
    #W_r = np.loadtxt(W_filename, delimiter=',')
    #H_r = np.loadtxt(H_filename, delimiter=',')
    #show_matrices(W_r, H_r)
    #plt.savefig('tm_tests/real' + tp + '.eps', format='eps')
    res = [0] * cfg['runs']
    finals = [0] * cfg['runs']
    hdist_runs = [0] * cfg['runs']
    exp_time = [0] * cfg['runs']
    meas = cfg['measure'].split(',')
    meas_name = [''] * len(meas)
    for i, f_name in enumerate(meas):
        f = getattr(measure, f_name + '_name')
        meas_name[i] = f()
    print('Measures:', meas_name)
    if cfg['compare_methods']:
        methods = cfg['schedule'].split(',')
        nmethods = len(methods)
    for r in range(cfg['runs']):
        if cfg['print_lvl'] > 0:
            print('Run', r + 1)
        #(W, H) = gen_init(cfg)
        if cfg['print_lvl'] > 0:
            print(' Starting...')
        labels = None
        st = time()
        if r >= cfg['prepare'] and cfg['prepare'] >= 0 and cfg['prepare_method'] > 0:
            print('Preparing data...')
            if cfg['prepare_method'] == 1:
                # Anchor-word (Arora-style) initialization of W.
                W = anchor_words(V, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, V))
                H[H < eps] = 0
                H = normalize_cols(H)
            elif cfg['prepare_method'] == 2:
                # Cluster documents and take the centroids as topics.
                centroids, labels = reduce_cluster(V.T, cfg['T'], cfg)
                W = centroids.T
                W[W < eps] = 0
                W = normalize_cols(W)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, V))
                H[H < eps] = 0
                H = normalize_cols(H)
            elif cfg['prepare_method'] == 3:
                # Cluster words, then run anchor words on the reduced matrix.
                centroids, labels = reduce_cluster(V, cfg['num_clusters'], cfg)
                W = anchor_words(centroids, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, normalize_cols(centroids)))
                H[H < eps] = 0
                H = normalize_cols(H)
                W = restore_cluster(W, labels, cfg)
            elif cfg['prepare_method'] >= 4 and cfg['prepare_method'] <= 6:
                # t-SNE embedding (to 4, 3 or 2 dimensions), then clustering.
                if cfg['prepare_method'] == 4:
                    red = reduce_tsne(V, to_dim=4)
                elif cfg['prepare_method'] == 5:
                    red = reduce_tsne(V, to_dim=3)
                elif cfg['prepare_method'] == 6:
                    red = reduce_tsne(V, to_dim=2)
                centroids, labels = reduce_cluster(red, cfg['num_clusters'], cfg)
                nearest_words = find_nearest(red, centroids, labels)
                V_reduced = normalize_cols(V[nearest_words, :])
                W = anchor_words(V_reduced, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, V_reduced))
                H[H < eps] = 0
                H = normalize_cols(H)
                W = restore_cluster(W, labels, cfg)
            elif cfg['prepare_method'] == 10:
                centroids, labels = reduce_multi_cluster(V, cfg['num_clusters'], cfg)
                W = anchor_words(centroids, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, normalize_cols(centroids)))
                H[H < eps] = 0
                H = normalize_cols(H)
                #W = restore_multi_cluster(W, labels, cfg)
                W = linalg.solve(dot(H, H.T) + eye(H.shape[0]) * eps,
                                 dot(H, V.T)).T
                W[W < eps] = 0
                W = normalize_cols(W)
        else:
            (W, H) = gen_init(cfg)
            #cur_frob = measure.frobenius(V, np.dot(W, H))
            #for init_it in range(200):
            #    W_new, H_new = gen_init(cfg)
            #    if measure.frobenius(V, np.dot(W_new, H_new)) < cur_frob:
            #        W = deepcopy(W_new)
            #        H = deepcopy(H_new)
        se = time() - st
        print('Preparing took time:', timedelta(seconds=se))
        if cfg['compare_prepare'] > 0:
            if r > 0:
                print('Preparing data...')
            if r == 1:
                W = anchor_words(V, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, V))
                H[H < eps] = 0
                H = normalize_cols(H)
            elif r == 2:
                centroids, labels = reduce_cluster(V, cfg['T'], cfg)
                H = centroids
                H[H < eps] = 0
                H = normalize_cols(H)
                print('Solving for W')
                W = linalg.solve(dot(H, H.T) + eye(H.shape[0]) * eps,
                                 dot(H, V.T)).T
                W[W < eps] = 0
                W = normalize_cols(W)
            elif r == 3:
                centroids, labels = reduce_cluster(V, cfg['num_clusters'], cfg)
                W = anchor_words(centroids, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, normalize_cols(centroids)))
                H[H < eps] = 0
                H = normalize_cols(H)
                W = restore_cluster(W, labels, cfg)
            elif r >= 4 and r <= 6:
                if r == 4:
                    red = reduce_tsne(V, to_dim=4)
                elif r == 5:
                    red = reduce_tsne(V, to_dim=3)
                elif r == 6:
                    red = reduce_tsne(V, to_dim=2)
                centroids, labels = reduce_cluster(red, cfg['num_clusters'], cfg)
                nearest_words = find_nearest(red, centroids, labels)
                V_reduced = V[nearest_words, :]
                W = anchor_words(V_reduced, 'L2', cfg)
                print('Solving for H')
                H = linalg.solve(np.dot(W.T, W) + np.eye(W.shape[1]) * eps,
                                 np.dot(W.T, V_reduced))
                H[H < eps] = 0
                H = normalize_cols(H)
                W = restore_cluster(W, labels, cfg)
        if cfg['compare_methods'] > 0:
            # Round-robin over the configured methods across runs.
            cfg['schedule'] = methods[r % nmethods]
        start = time()
        (val, hdist, it, W, H, status) = run(V, W, H, W_r, H_r, cfg)
        stop = time()
        print('Run time:', timedelta(seconds=stop - start))
        exp_time[r] = stop - start
        res[r] = val
        hdist_runs[r] = hdist
        if cfg['print_lvl'] > 0:
            print(' Result:', val[-1, :])
        for i, fun_name in enumerate(cfg['finals'].split(',')):
            #val = np.array([r[:, i] for r in res])
            fun = getattr(measure, fun_name)
            name, val = fun(W, H)
            print(name, ':', val)
    print(cfg['experiment'])
    if cfg['experiment'] == '':
        exp_name = 'test'
    else:
        exp_name = cfg['experiment']
    #if cfg['save_results']:
    #    if cfg['save_file']:
    #        results_file = cfg['save_file']
    #    with open(results_file, 'w') as rf:
    #        print_head(rf)
    #        print('# Generated on {}'.format(datetime.today()), file=rf)
    #        print('# Experiments config:', file=rf)
    #        print('# Number of experiments: {}'.format(cfg['runs']), file=rf)
    #        print('# Methods schedule: {}'.format(cfg['schedule']), file=rf)
    #        print('# Iterations number: {}'.format(cfg['max_iter']), file=rf)
    #        print('# All experiments done in {}'.format(
    #            timedelta(seconds=sum(exp_time))), file=rf)
    #        print_head(rf)
    #        for r in range(cfg['runs']):
    #            print('# Run #{}. Done in {}'.format(
    #                r + 1, timedelta(seconds=exp_time[r])), file=rf)
    #            [print(val, file=rf) for val in res[:, r]]
    #            print_head(rf)
    if cfg['show_results']:
        if not os.path.exists(cfg['result_dir']):
            os.makedirs(cfg['result_dir'])
        np.savetxt(join(cfg['result_dir'], cfg['experiment'] + '_W.csv'), W)
        #show_topics(W, 25, vocab=vocab)
        save_topics(W, join(cfg['result_dir'],
                            cfg['experiment'] + '_topics.txt'), vocab)
        #plot_matrix(V, 'Documents', labels=labels, vocab=vocab)
        #filename = os.path.join(cfg['result_dir'], cfg['experiment'] + '_V.eps')
        #plt.savefig(filename, format='eps')
        #plot_matrix(W, u'Word distribution over topics', labels, vocab)
        #filename = os.path.join(cfg['result_dir'], cfg['experiment'] + '_W.pdf')
        #plt.savefig(filename, format='pdf')
        for i, fun_name in enumerate(cfg['measure'].split(',')):
            val = np.array([r[:, i] for r in res])
            fun = getattr(measure, fun_name + '_name')
            plot_measure(val.T, fun())
            filename = os.path.join(cfg['result_dir'],
                                    cfg['experiment'] + '_' + fun_name + '.pdf')
            plt.savefig(filename, format='pdf')
        if cfg['compare_real']:
            print('Hellinger res:', hdist_runs[0][-1, 0])
            plot_measure(np.array([r[:, 0] for r in hdist_runs]).T,
                         measure.hellinger_name())
            show_matrices_recovered(W_r, H_r, W, H, cfg, permute=True)
            #plt.savefig('tm_tests/recovered_cnmf_' + tp + '.eps', format='eps')
    #plt.show()
    return res

def plsa(V, W, H, post='', cfg=config.default_config()):
    """One PLSA EM step with sequential updates: H first, then W from the new H."""
    eps = cfg['eps']
    tmp = V / maximum(dot(W, H), eps)
    H = normalize_cols(H * dot(W.T, tmp))
    W = normalize_cols(W * dot(tmp, H.T))
    return W, H

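# Usage sketch (added; assumes normalize_cols rescales columns to unit L1
# mass, as the rest of the module implies). Runs a short PLSA fit on random
# data; the multiplicative updates preserve nonnegativity and column sums.
def _demo_plsa():
    cfg = {'eps': 1e-10}
    N, T, M = 40, 5, 30
    V = normalize_cols(np.abs(np.random.randn(N, M)))
    W = normalize_cols(np.abs(np.random.randn(N, T)))
    H = normalize_cols(np.abs(np.random.randn(T, M)))
    for _ in range(50):
        W, H = plsa(V, W, H, cfg=cfg)
    assert np.allclose(W.sum(axis=0), 1.0)
    assert np.allclose(H.sum(axis=0), 1.0)
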
def restore_cluster(W_reduced, labels, params):
    """Expand cluster-level topic rows back to word level: each word inherits
    the row of its cluster centroid."""
    W = zeros((params['N'], params['T']))
    for word, label in enumerate(labels):
        W[word, :] = W_reduced[label, :]
    return normalize_cols(W)

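# Usage sketch (added; sizes and labels are hypothetical). Each word simply
# inherits the topic row of its cluster centroid.
def _demo_restore_cluster():
    params = {'N': 6, 'T': 2}
    labels = np.array([0, 0, 1, 2, 1, 0])  # word index -> cluster id
    W_reduced = normalize_cols(np.abs(np.random.randn(3, 2)))  # 3 clusters, 2 topics
    W = restore_cluster(W_reduced, labels, params)
    # Words 0, 1 and 5 share cluster 0, hence identical (renormalized) rows.
    assert W.shape == (6, 2)
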
def plsa(F, Phi, Theta, post='', cfg=config.default_config()):
    """One PLSA EM step with simultaneous updates: both factors are computed
    from the same (Phi, Theta) pair, unlike the sequential variant above."""
    eps = cfg['eps']
    tmp = F / maximum(dot(Phi, Theta), eps)
    Theta, Phi = (normalize_cols(Theta * dot(Phi.T, tmp)),
                  normalize_cols(Phi * dot(tmp, Theta.T)))
    return Phi, Theta

def initialize_matrices(i, F, cfg=config.default_config()):
    """Initialize matrices Phi, Theta.

    - Return: Phi, Theta
    - Used params: prepare_method
    """
    method = int(cfg['prepare_method'].split(',')[i])
    if method == 1:
        print("Arora")
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        Phi = prepare.anchor_words(F_norm, 'L2', cfg)
        print('Solving for Theta')
        Theta = np.linalg.solve(np.dot(Phi.T, Phi) + np.eye(Phi.shape[1]) * eps,
                                np.dot(Phi.T, F_norm))
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        return Phi, Theta
    elif method == 2:
        print("Random rare")
        cfg['phi_sparsity'] = 0.05
        cfg['theta_sparsity'] = 0.1
        return gen_init(cfg)
    elif method == 3:
        print("Random uniform")
        cfg['phi_sparsity'] = 1.
        cfg['theta_sparsity'] = 1.
        return gen_init(cfg)
    elif method == 4:
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        print("Clustering of words")
        centroids, labels = prepare.reduce_cluster(F_norm, cfg['T'], cfg)
        Theta = centroids
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        print('Solving for Phi')
        Phi = np.transpose(np.linalg.solve(
            np.dot(Theta, Theta.T) + np.eye(Theta.shape[0]) * eps,
            np.dot(Theta, F_norm.T)))
        Phi[Phi < eps] = 0
        Phi = normalize_cols(Phi)
        return Phi, Theta
    elif method == 5:
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        print("SVD init")
        U, s, V = np.linalg.svd(F_norm)
        Phi, Theta = construct_from_svd(U, s, V, cfg)
        return Phi, Theta
    elif method == 6:
        eps = cfg['eps']
        # Re-weight counts with tf-idf before clustering.
        transformer = TfidfTransformer()
        transformer.fit(F)
        F_tfidf = transformer.transform(F).toarray()
        print("Clustering of tf-idf")
        centroids, labels = prepare.reduce_cluster(F_tfidf, cfg['T'], cfg)
        Theta = centroids
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        print('Solving for Phi')
        Phi = np.transpose(np.linalg.solve(
            np.dot(Theta, Theta.T) + np.eye(Theta.shape[0]) * eps,
            np.dot(Theta, F_tfidf.T)))
        Phi[Phi < eps] = 0
        Phi = normalize_cols(Phi)
        return Phi, Theta
    elif method == 7:
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        print("Clustering of words mixed")
        centroids, labels = prepare.reduce_cluster(F_norm, cfg['T'], cfg)
        Theta = centroids
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        print('Solving for Phi')
        Phi = np.transpose(np.linalg.solve(
            np.dot(Theta, Theta.T) + np.eye(Theta.shape[0]) * eps,
            np.dot(Theta, F_norm.T)))
        Phi[Phi < eps] = 0
        Phi = normalize_cols(Phi)
        cfg['phi_sparsity'] = 1.
        cfg['theta_sparsity'] = 1.
        Phi1, Theta1 = gen_init(cfg)
        # Convex mix of the clustering solution with a random initialization.
        zzz = 0.3
        return zzz * Phi1 + (1. - zzz) * Phi, zzz * Theta1 + (1. - zzz) * Theta
    elif method == 8:
        print("Arora mixed")
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        Phi = prepare.anchor_words(F_norm, 'L2', cfg)
        print('Solving for Theta')
        Theta = np.linalg.solve(np.dot(Phi.T, Phi) + np.eye(Phi.shape[1]) * eps,
                                np.dot(Phi.T, F_norm))
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        cfg['phi_sparsity'] = 1.
        cfg['theta_sparsity'] = 1.
        Phi1, Theta1 = gen_init(cfg)
        zzz = 0.3
        return zzz * Phi1 + (1. - zzz) * Phi, zzz * Theta1 + (1. - zzz) * Theta
    elif method == 9:
        print("Arora uniform")
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        Phi = prepare.anchor_words(F_norm, 'L2', cfg)
        print('Solving for Theta')
        Theta = np.ones((Phi.shape[1], F.shape[1]))
        Theta = normalize_cols(Theta)
        return Phi, Theta
    elif method == 10:
        eps = cfg['eps']
        F_norm = normalize_cols(F)
        print("Clustering of docs")
        centroids, labels = prepare.reduce_cluster(F_norm.T, cfg['T'], cfg)
        Phi = centroids.T
        Phi[Phi < eps] = 0
        Phi = normalize_cols(Phi)
        print('Solving for Theta')
        Theta = np.linalg.solve(np.dot(Phi.T, Phi) + np.eye(Phi.shape[1]) * eps,
                                np.dot(Phi.T, F_norm))
        Theta[Theta < eps] = 0
        Theta = normalize_cols(Theta)
        return Phi, Theta

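# Usage sketch (added; exercises only the SVD branch, prepare_method == 5,
# which needs nothing beyond numpy and construct_from_svd).
def _demo_initialize_matrices_svd():
    cfg = {'prepare_method': '5', 'eps': 1e-8, 'T': 5}
    F = np.abs(np.random.randn(100, 40))
    Phi, Theta = initialize_matrices(0, F, cfg)
    assert Phi.shape == (100, 5) and Theta.shape == (5, 40)
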
def restore_multi_cluster(W_reduced, labels, params):
    """Word-level W from multi-cluster assignments: each word's row is the mean
    of the centroid rows of the clusters it belongs to."""
    W = zeros((params['N'], params['T']))
    for word in range(W.shape[0]):
        # Average the selected centroid rows along axis 0 into one T-vector.
        W[word, :] = mean(W_reduced[labels[word, :], :], axis=0)
    return normalize_cols(W)