def anchor_model(collection, wrd_count, num_topics=100, metrics=None, k=400, verbose=False, noun=False, bi=False): logger.info('Start anchor_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words(collection.documents_test) logger.info('Build word x documents matrix') m_mtx = m_matrix(bw_train, wrd_count) if bi: m_mtx = add_bigramm_to_m(m_mtx, collection) logger.info('Build cov matrix') cov_matrix = topic_cov_mtx(m_mtx) logger.info('Find anch words candidat') find_cand = find_candidate if not noun else find_candidate_noun candidate_anchors = find_cand(m_mtx, collection, k=k) logger.info('Find anch words') anchors = find_anchors(cov_matrix, candidate_anchors, num_topics) logger.info('Recover word x topic matrix') word_topic = recover_word_topic(cov_matrix, anchors, collection.num_wrd) if metrics: logger.info('Eval metrics') metric_val = [metric(word_topic, collection.documents_train, collection.documents_test) for metric in metrics] print 'end: %s' % ' '.join(metric_val) return word_topic, anchors
def preplexity(word_topic, train, test):
    """Return a report string with train and test perplexity of word_topic."""
    def split_perplexity(documents):
        # Estimate the topic x document distribution for one split, then
        # score it; estimate_teta's second return value is not needed here.
        bag = bag_of_words(documents)
        theta, _ = estimate_teta(word_topic, bag)
        return _preplexity(word_topic, theta, bag)

    return 'preplexity train = %.2f, test = %.2f' % (
        split_perplexity(train), split_perplexity(test))
def plsa_model(collection, wrd_count, num_topics=100, num_iter=10, metrics=None, verbose=False, F=None): logger.info('Start plsa_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words(collection.documents_test) doc_count = len(bw_train) if F is None: F, T = norn_mtx(wrd_count, num_topics, axis='x'), norn_mtx(num_topics, doc_count, axis='y') else: T = estimate_teta_full(F, bw_train) if not(metrics and verbose): bar = Bar('Processing', max=num_iter) logger.info('Begin itters') for itter in xrange(num_iter): Nwt, Ntd = np.zeros((wrd_count, num_topics)), np.zeros((num_topics, doc_count)) Nt, Nd = np.zeros(num_topics), np.zeros(doc_count) for d in xrange(doc_count): for w, ndw in bw_train[d]: ndwt = F[w, :] * T[:, d] ndwt *= ndw * (1.0 / ndwt.sum()) Nwt[w] += ndwt Ntd[:, d] += ndwt Nt += ndwt Nd[d] += ndwt.sum() for w in xrange(wrd_count): F[w] = Nwt[w] / Nt for t in range(num_topics): T[t] = Ntd[t] / Nd if metrics and verbose: metric_val = [metric(F, collection.documents_train, collection.documents_test) for metric in metrics] print 'iter %s: %s' % (str(itter).zfill(2), ' '.join(metric_val)) else: bar.next() if not(metrics and verbose): bar.finish() if metrics: logger.info('Eval metrics') metric_val = [metric(F, collection.documents_train, collection.documents_test) for metric in metrics] print 'end: %s' % ' '.join(metric_val) return F, T
def anchor_model(collection, wrd_count, num_topics=100, metrics=None, k=1000, verbose=False, noun=False, bi=False): logger.info('Start anchor_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words( collection.documents_test) logger.info('Build word x documents matrix') m_mtx = m_matrix(bw_train, wrd_count) if bi: m_mtx = add_bigramm_to_m(m_mtx, collection) logger.info('Build cov matrix') cov_matrix = topic_cov_mtx(m_mtx) logger.info('Find anch words candidat') find_cand = find_candidate if not noun else find_candidate_noun candidate_anchors = find_cand(m_mtx, collection, k=k) logger.info('Find anch words') anchors = find_anchors(cov_matrix, candidate_anchors, num_topics) logger.info('Recover word x topic matrix') word_topic = recover_word_topic(cov_matrix, anchors, collection.num_wrd) if metrics: logger.info('Eval metrics') metric_val = [ metric(word_topic, collection.documents_train, collection.documents_test) for metric in metrics ] print 'end: %s' % ' '.join(metric_val) return word_topic, anchors
def plsa_model(collection, wrd_count, num_topics=100, num_iter=10, metrics=None, verbose=False, F=None): logger.info('Start plsa_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words( collection.documents_test) doc_count = len(bw_train) if F is None: F, T = norn_mtx(wrd_count, num_topics, axis='x'), norn_mtx(num_topics, doc_count, axis='y') else: T = estimate_teta_full(F, bw_train) if not (metrics and verbose): bar = Bar('Processing', max=num_iter) logger.info('Begin itters') for itter in xrange(num_iter): Nwt, Ntd = np.zeros((wrd_count, num_topics)), np.zeros( (num_topics, doc_count)) Nt, Nd = np.zeros(num_topics), np.zeros(doc_count) for d in xrange(doc_count): for w, ndw in bw_train[d]: ndwt = F[w, :] * T[:, d] ndwt *= ndw * (1.0 / ndwt.sum()) Nwt[w] += ndwt Ntd[:, d] += ndwt Nt += ndwt Nd[d] += ndwt.sum() for w in xrange(wrd_count): F[w] = Nwt[w] / Nt for t in range(num_topics): T[t] = Ntd[t] / Nd if metrics and verbose: metric_val = [ metric(F, collection.documents_train, collection.documents_test) for metric in metrics ] print 'iter %s: %s' % (str(itter).zfill(2), ' '.join(metric_val)) else: bar.next() if not (metrics and verbose): bar.finish() if metrics: logger.info('Eval metrics') metric_val = [ metric(F, collection.documents_train, collection.documents_test) for metric in metrics ] print 'end: %s' % ' '.join(metric_val) return F, T