Example #1

import tomotopy as tp

def run_gdmr(input_file,
             topics,
             degrees=[],
             optim_interval=20,
             burn_in=200,
             alpha=1e-2,
             sigma=0.25,
             sigma0=3.0,
             md_range=None,
             iteration=800,
             save_path=None):

    print('metadata range:', md_range)

    corpus = tp.utils.Corpus()
    for line in open(input_file, encoding='utf-8'):
        fd = line.strip().split()
        # tomotopy >= 0.11 takes the continuous metadata fields via `numeric_metadata`
        corpus.add_doc(fd[2:], numeric_metadata=list(map(float, fd[:2])))

    mdl = tp.GDMRModel(k=topics,
                       degrees=degrees,
                       alpha=alpha,
                       sigma=sigma,
                       sigma0=sigma0,
                       metadata_range=md_range,
                       corpus=corpus)
    mdl.optim_interval = optim_interval
    mdl.burn_in = burn_in

    mdl.train(0)

    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    for i in range(0, iteration, 20):
        print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
        mdl.train(min(iteration - i, 20))
    print('Iteration: {:04} LL per word: {:.4}'.format(iteration,
                                                       mdl.ll_per_word))

    if save_path: mdl.save(save_path)
    return mdl
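
# Usage sketch: this call is illustrative and assumes the two-metadata
# sample file 'dataset2.txt' from
# https://github.com/bab2min/g-dmr/tree/master/data (see Example #2 below).
if __name__ == '__main__':
    run_gdmr('dataset2.txt',
             topics=30,
             degrees=[4, 3],
             md_range=[(2000, 2017), (0, 1)],
             save_path='gdmr.model')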
Example #2

'''
You can get the sample data file from https://github.com/bab2min/g-dmr/tree/master/data .
'''
import tomotopy as tp

corpus = tp.utils.Corpus()
for line in open('dataset2.txt', encoding='utf-8'):
    fd = line.strip().split()
    corpus.add_doc(fd[2:], numeric_metadata=list(map(float, fd[:2])))

# Set the range of the first metadata field to [2000, 2017]
# and that of the second to [0, 1].
mdl = tp.GDMRModel(tw=tp.TermWeight.PMI,
                   k=30,
                   degrees=[4, 3],
                   alpha=1e-2,
                   sigma=0.25,
                   sigma0=3.0,
                   metadata_range=[(2000, 2017), (0, 1)],
                   corpus=corpus)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
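
# After training, the learned topics can be inspected; a minimal sketch
# using tomotopy's get_topic_words():
for topic_id in range(mdl.k):
    words = ', '.join(w for w, _ in mdl.get_topic_words(topic_id, top_n=5))
    print('Topic #{:02}: {}'.format(topic_id, words))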
Example #3

import tomotopy as tp

corpus = tp.utils.Corpus()
for line in open('text_mining_year_journal.txt', encoding='utf-8'):
    fd = line.strip().split('\t', maxsplit=2)
    corpus.add_doc(fd[2].split(),
                   numeric_metadata=[float(fd[0])],
                   metadata=fd[1])
# Use the argument `numeric_metadata` for continuous numeric metadata (a list of floats),
# and the argument `metadata` for categorical metadata (a str).

# Set the range of the numeric metadata to [2000, 2017].
# `decay=1.0` penalizes the higher-order terms of the lambdas to prevent overfitting.
mdl = tp.GDMRModel(tw=tp.TermWeight.ONE,
                   k=30,
                   degrees=[6],
                   alpha=1e-2,
                   sigma=0.25,
                   sigma0=3.0,
                   decay=1.0,
                   metadata_range=[(2000, 2017)],
                   corpus=corpus)
mdl.optim_interval = 20
mdl.burn_in = 200

mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words))

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04} LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
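
# Query sketch (hedged): `metadata_dict` and `tdf()` are tomotopy GDMRModel
# members as of v0.11; check your version. This prints the first few entries
# of the topic distribution at year 2010 for each journal category.
for journal in mdl.metadata_dict:
    dist = mdl.tdf([2010.0], metadata=journal)
    print(journal, dist[:3])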
Example #4
File: gdmr.py  Project: ankane/tomoto
import tomotopy as tp

# Construct a GDMR model with one linear metadata dimension and inspect
# its default hyperparameters.
model = tp.GDMRModel(degrees=[1])
print(model.alpha)
print(model.alpha_epsilon)
print(model.degrees)
print(model.eta)
print(model.f)
print(model.sigma)
print(model.sigma0)
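
# Sketch (illustrative values only): the hyperparameters above can also be
# set explicitly at construction, as in Examples #1-#3.
model2 = tp.GDMRModel(degrees=[2, 2], alpha=0.1, sigma=0.5, sigma0=2.0)
print(model2.degrees)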