def anchor_model(collection, wrd_count, num_topics=100, metrics=None, k=400, verbose=False, noun=False, bi=False): logger.info('Start anchor_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words(collection.documents_test) logger.info('Build word x documents matrix') m_mtx = m_matrix(bw_train, wrd_count) if bi: m_mtx = add_bigramm_to_m(m_mtx, collection) logger.info('Build cov matrix') cov_matrix = topic_cov_mtx(m_mtx) logger.info('Find anch words candidat') find_cand = find_candidate if not noun else find_candidate_noun candidate_anchors = find_cand(m_mtx, collection, k=k) logger.info('Find anch words') anchors = find_anchors(cov_matrix, candidate_anchors, num_topics) logger.info('Recover word x topic matrix') word_topic = recover_word_topic(cov_matrix, anchors, collection.num_wrd) if metrics: logger.info('Eval metrics') metric_val = [metric(word_topic, collection.documents_train, collection.documents_test) for metric in metrics] print 'end: %s' % ' '.join(metric_val) return word_topic, anchors
def preplexity(word_topic, train, test):
    """Return a report string with train and test perplexity of word_topic."""
    def split_perplexity(documents):
        # Estimate the topic x document distribution for one split, then
        # score it; estimate_teta's second return value is not needed here.
        bag = bag_of_words(documents)
        theta, _ = estimate_teta(word_topic, bag)
        return _preplexity(word_topic, theta, bag)

    return 'preplexity train = %.2f, test = %.2f' % (
        split_perplexity(train), split_perplexity(test))
def plsa_model(collection, wrd_count, num_topics=100, num_iter=10, metrics=None, verbose=False, F=None): logger.info('Start plsa_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words(collection.documents_test) doc_count = len(bw_train) if F is None: F, T = norn_mtx(wrd_count, num_topics, axis='x'), norn_mtx(num_topics, doc_count, axis='y') else: T = estimate_teta_full(F, bw_train) if not(metrics and verbose): bar = Bar('Processing', max=num_iter) logger.info('Begin itters') for itter in xrange(num_iter): Nwt, Ntd = np.zeros((wrd_count, num_topics)), np.zeros((num_topics, doc_count)) Nt, Nd = np.zeros(num_topics), np.zeros(doc_count) for d in xrange(doc_count): for w, ndw in bw_train[d]: ndwt = F[w, :] * T[:, d] ndwt *= ndw * (1.0 / ndwt.sum()) Nwt[w] += ndwt Ntd[:, d] += ndwt Nt += ndwt Nd[d] += ndwt.sum() for w in xrange(wrd_count): F[w] = Nwt[w] / Nt for t in range(num_topics): T[t] = Ntd[t] / Nd if metrics and verbose: metric_val = [metric(F, collection.documents_train, collection.documents_test) for metric in metrics] print 'iter %s: %s' % (str(itter).zfill(2), ' '.join(metric_val)) else: bar.next() if not(metrics and verbose): bar.finish() if metrics: logger.info('Eval metrics') metric_val = [metric(F, collection.documents_train, collection.documents_test) for metric in metrics] print 'end: %s' % ' '.join(metric_val) return F, T
def anchor_model(collection, wrd_count, num_topics=100, metrics=None, k=1000, verbose=False, noun=False, bi=False): logger.info('Start anchor_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words( collection.documents_test) logger.info('Build word x documents matrix') m_mtx = m_matrix(bw_train, wrd_count) if bi: m_mtx = add_bigramm_to_m(m_mtx, collection) logger.info('Build cov matrix') cov_matrix = topic_cov_mtx(m_mtx) logger.info('Find anch words candidat') find_cand = find_candidate if not noun else find_candidate_noun candidate_anchors = find_cand(m_mtx, collection, k=k) logger.info('Find anch words') anchors = find_anchors(cov_matrix, candidate_anchors, num_topics) logger.info('Recover word x topic matrix') word_topic = recover_word_topic(cov_matrix, anchors, collection.num_wrd) if metrics: logger.info('Eval metrics') metric_val = [ metric(word_topic, collection.documents_train, collection.documents_test) for metric in metrics ] print 'end: %s' % ' '.join(metric_val) return word_topic, anchors
def plsa_model(collection, wrd_count, num_topics=100, num_iter=10, metrics=None, verbose=False, F=None): logger.info('Start plsa_model') logger.info('Create bag of words') bw_train, bw_test = bag_of_words(collection.documents_train), bag_of_words( collection.documents_test) doc_count = len(bw_train) if F is None: F, T = norn_mtx(wrd_count, num_topics, axis='x'), norn_mtx(num_topics, doc_count, axis='y') else: T = estimate_teta_full(F, bw_train) if not (metrics and verbose): bar = Bar('Processing', max=num_iter) logger.info('Begin itters') for itter in xrange(num_iter): Nwt, Ntd = np.zeros((wrd_count, num_topics)), np.zeros( (num_topics, doc_count)) Nt, Nd = np.zeros(num_topics), np.zeros(doc_count) for d in xrange(doc_count): for w, ndw in bw_train[d]: ndwt = F[w, :] * T[:, d] ndwt *= ndw * (1.0 / ndwt.sum()) Nwt[w] += ndwt Ntd[:, d] += ndwt Nt += ndwt Nd[d] += ndwt.sum() for w in xrange(wrd_count): F[w] = Nwt[w] / Nt for t in range(num_topics): T[t] = Ntd[t] / Nd if metrics and verbose: metric_val = [ metric(F, collection.documents_train, collection.documents_test) for metric in metrics ] print 'iter %s: %s' % (str(itter).zfill(2), ' '.join(metric_val)) else: bar.next() if not (metrics and verbose): bar.finish() if metrics: logger.info('Eval metrics') metric_val = [ metric(F, collection.documents_train, collection.documents_test) for metric in metrics ] print 'end: %s' % ' '.join(metric_val) return F, T