if args.bow_norm: normalized_data_batch = data_batch / sums else: normalized_data_batch = data_batch theta, _ = model.get_theta(normalized_data_batch) thetaAvg += theta.sum(0).unsqueeze(0) / args.num_docs_train weighed_theta = sums * theta thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0) if idx % 100 == 0 and idx > 0: print('batch: {}/{}'.format(idx, len(indices))) thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt print('\nThe 10 most used topics are {}'.format( thetaWeightedAvg.argsort()[::-1][:10])) ## show topics beta = model.get_beta() topic_indices = list(np.random.choice(args.num_topics, 10)) # 10 random topics print('\n') for k in range(args.num_topics): #topic_indices: gamma = beta[k] top_words = list(gamma.cpu().numpy().argsort()[-args.num_words + 1:][::-1]) topic_words = [vocab[a] for a in top_words] print('Topic {}: {}'.format(k, topic_words)) if args.train_embeddings: ## show etm embeddings try: rho_etm = model.rho.weight.cpu() except:
# get the theta theta, _ = etm_model.get_theta(normalized_data_batch) thetaAvg += theta.sum(0).unsqueeze(0) / num_docs_train weighed_theta = sums * theta thetaWeightedAvg += weighed_theta.sum(0).unsqueeze(0) # let's print the progress as we go if idx % 100 == 0 and idx > 0: print('batch: {}/{}'.format(idx, len(indices))) # finally the results are in thetaWeightedAvg = thetaWeightedAvg.squeeze().cpu().numpy() / cnt print('\nThe 10 most used topics are {}'.format( thetaWeightedAvg.argsort()[::-1][:10])) # Now we show the topics # A nice visualisation is always welcome beta = etm_model.get_beta() topic_indices = list(np.random.choice(num_topics, 10)) # 10 random topics print('\n') for k in range(num_topics): # topic_indices: gamma = beta[k] top_words = list(gamma.cpu().numpy().argsort()[-num_words + 1:][::-1]) topic_words = [idx2word[a] for a in top_words] print('Topic {}: {}'.format(k, topic_words)) # Why not, also showing a few embeddings if train_embeddings: # get embeddings from the model try: rho_etm = etm_model.rho.weight.cpu() except: rho_etm = etm_model.rho.cpu()