def pp_inference_tm(state_dwz_, dict_params_, dict_args_):
    try:
        path_tm_ = dict_args_['path_tm']
    except KeyError:
        # without a valid path the tm_inference call below would fail anyway
        raise KeyError("specify the path to topicmapping in dict_args_['path_tm']; "
                       'the default might not work')
    # # parameters
    D_ = dict_params_['D']
    V_ = dict_params_['V']
    K_ = dict_params_['K']

    # # convert state into corpus
    n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_, D_, V_, K_)
    texts_ = nwd_to_texts(n_wd_)

    state_dwz_infer_, n_wj_infer_, n_jd_infer_, K_infer_ = tm_inference(
        path_tm_, texts_)
    # # nmi between the true and the inferred token labeling
    nmi = state_dwz_nmi(state_dwz_, state_dwz_infer_, K_, K_infer_)

    # # doc-topic distribution from the inferred counts (globally normalized)
    p_dt_infer = np.transpose(n_jd_infer_ / float(np.sum(n_jd_infer_)))

    list_t_d_infer = predict_topic_p_td_unsup(p_dt_infer)

    return nmi, K_infer_, p_dt_infer, list_t_d_infer
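
# Minimal usage sketch, assuming the synthetic generator below behaves as
# documented; parameter values are illustrative and the topicmapping path is
# an assumption that must point at a local build.
def _demo_pp_inference_tm():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=3, D=20, m=50)
    dict_params = {'D': 20, 'V': 200, 'K': 3}
    dict_args = {'path_tm': os.path.abspath(
        os.path.join(os.pardir, 'src/external/topicmapping'))}
    nmi, k_infer, p_dt, list_t_d = pp_inference_tm(
        dict_corpus['state_dwz'], dict_params, dict_args)
    print('token-labeling NMI: %s, inferred K: %s' % (nmi, k_infer))
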
def make_dict_corpus_for_inference(dict_output_corpus):
    '''
    Take the output of topicmodel_synthetic_front [which generates the synthetic corpus]
    and put it into the form expected by topicmodel_inference_front [which infers the synthetic corpus].
    IN:
    - dict, contains 'state_dwz', 'p_wt', 'p_td'
    OUT:
    - dict, contains 'texts_list_shuffle', 'state_dwz_shuffle'
    '''
    state_dwz = dict_output_corpus['state_dwz']
    p_wt = dict_output_corpus['p_wt']
    p_td = dict_output_corpus['p_td']
    V, K = np.shape(p_wt)
    K, D = np.shape(p_td)
    # ## convert state into corpus
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd.astype('int'))
    dict_corpus_tmp = {
        'texts_list_shuffle': texts,
        'state_dwz_shuffle': state_dwz
    }
    return dict_corpus_tmp
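
# Minimal usage sketch: chain a synthetic generator into
# make_dict_corpus_for_inference. Assumes synthetic_dirichlet_terminal
# (defined below) returns the required 'state_dwz', 'p_wt' and 'p_td' keys,
# as its code indicates; parameter values are illustrative.
def _demo_make_dict_corpus_for_inference():
    dict_corpus = synthetic_dirichlet_terminal(
        V=200, K=3, dist_w='uni', D=20, m=50, alpha=0.1, seed=1)
    dict_infer = make_dict_corpus_for_inference(dict_corpus)
    texts = dict_infer['texts_list_shuffle']
    print('%s documents, %s tokens in the first one' % (len(texts), len(texts[0])))
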
def synthetic_single_stopword_terminal(V=1000,
                                       K=5,
                                       D=100,
                                       m=100,
                                       dist_w='uni',
                                       dist_t=None,
                                       dist_stop='uni',
                                       p_s=0,
                                       c_w=1,
                                       c_t=None,
                                       seed=None,
                                       burstiness=None,
                                       if_info=0):
    '''
    Output:
    p_w_td: p(w|t,d). In general p(w|t,d) = p(w|t) for every document d;
    in the burstiness case, however, p(w|t,d) differs from document to document.
    '''

    if dist_t is None:
        dist_t = dist_w

    if c_t is None:
        c_t = c_w

    # Get the global word distribution p(w)
    p_w = get_pw_pt(V, dist=dist_w)

    # Get the stopword distribution
    stop_distrib = get_pw_pt(V, dist=dist_stop)

    # Choose stopword list
    num_stopword = int(V * p_s)
    np.random.seed(seed=seed)
    stopword_list = np.random.choice(V,
                                     size=num_stopword,
                                     replace=False,
                                     p=stop_distrib)

    # Get the number of word types in each topic
    num_nonstop = V - num_stopword
    V_t = get_vt_from_nonstop(
        K, num_nonstop, dist_t
    )  # V_t is the topic size for each topic, i.e., the number of useful non-stopwords in each topic

    # Get the topic assignment for each word: both stopwords and non-stopwords.
    # Stopwords are assigned a very large number as their topic id.
    word_topic_assign_list = get_word_topic_assign_list(V_t,
                                                        stopword_list,
                                                        seed=seed)

    # Get topic distribution p_t
    p_t = get_topic_distribution_p_t(K, p_w, word_topic_assign_list)

    # Get word-topic distribution
    p_wt = get_word_topic_distribution_p_wt(K, p_w, p_t,
                                            word_topic_assign_list, c_w)

    # Get the topic assignment for each document
    document_topic_assign_list = np.random.choice(K,
                                                  size=D,
                                                  replace=True,
                                                  p=p_t)

    # Get topic-document distribution
    p_td = get_topic_doc_distribution_ptd(K, p_t, c_t,
                                          document_topic_assign_list)

    # Get the synthetic corpus
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td,
                                              p_wt,
                                              m,
                                              burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)

    # Assemble the output dictionary
    dict_out = {}
    dict_out['p_w'] = p_w
    dict_out['V_t'] = V_t
    dict_out['word_topic_assign_list'] = word_topic_assign_list
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['p_w_td'] = p_w_td
    dict_out['document_topic_assign_list'] = document_topic_assign_list
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts

    # Get the structure
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
        dict_out['DeltaI'] = DeltaI
        dict_out['I_alpha'] = I_alpha

    return dict_out
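
# Minimal usage sketch: a uniform corpus with 10% stopwords. All parameter
# values are illustrative, not recommendations.
def _demo_synthetic_single_stopword_terminal():
    dict_out = synthetic_single_stopword_terminal(
        V=500, K=5, D=50, m=100, p_s=0.1, seed=42)
    print('topic sizes V_t: %s' % (dict_out['V_t'],))
    print('n_wd shape: %s' % (dict_out['n_wd'].shape,))
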
def synthetic_dirichlet_terminal(V,
                                 K,
                                 dist_w,
                                 D,
                                 m,
                                 alpha,
                                 beta=None,
                                 dist_t=None,
                                 seed=None,
                                 burstiness=None,
                                 if_info=True):

    if dist_t is None:
        dist_t = dist_w
    if beta is None:
        beta = 1.0 * alpha

    # # global distribution of topic-size
    p_t = get_pw_pt(K, dist=dist_t)
    # # global distribution of word frequencies
    p_w = get_pw_pt(V, dist=dist_w)

    # # get vector-hyperparameters
    vec_alpha = make_hyper_vec(alpha, p_t)
    vec_beta = make_hyper_vec(beta, p_w)

    # # create the mixture-matrices p_wt (word-topic) and p_td (topic-doc)
    p_td, p_wt = make_pwt_ptd_dirichlet(vec_alpha, vec_beta, D, seed=seed)

    # # draw the dwz-state
    state_dwz, p_w_td = draw_dwz_from_ptd_pwt(p_td,
                                              p_wt,
                                              m,
                                              burstiness=burstiness)
    n_wd, n_wj, n_jd = state_nwjd(state_dwz, D, V, K)
    texts = nwd_to_texts(n_wd)
    # # infer the topic-membership of each doc:
    # # choose topic with largest contribution from p(t|d)
    list_t_d_true = np.argmax(p_td, axis=0)

    # # empirical p_t and p_w; otherwise p_tw is not normalized
    p_t_emp = 1.0 / D * np.sum(p_td, axis=1)
    p_w_emp = np.sum(p_wt * p_t_emp, axis=1)  # p(w) = sum_t p(w|t) * p(t)

    # # infer the topic-membership of each word:
    # # choose topic with largest contribution from p(t|w) = p(w|t)*p(t)/p(w)
    p_tw = p_wt.T * (p_t_emp[:, np.newaxis] / p_w_emp)
    list_t_w_true = np.argmax(p_tw, axis=0)

    # # Get the structure
    if if_info:
        DeltaI, I_alpha = deltaI_from_nwd(n_wd)
    else:
        DeltaI = 0
        I_alpha = 0

    dict_out = {}
    dict_out['p_w'] = p_w
    dict_out['word_topic_assign_list'] = list_t_w_true
    dict_out['p_t'] = p_t
    dict_out['p_wt'] = p_wt
    dict_out['document_topic_assign_list'] = list_t_d_true
    dict_out['p_td'] = p_td
    dict_out['state_dwz'] = state_dwz
    dict_out['n_wd'] = n_wd
    dict_out['n_wj'] = n_wj
    dict_out['n_jd'] = n_jd
    dict_out['texts'] = texts
    dict_out['p_tw'] = p_tw

    dict_out['DeltaI'] = DeltaI
    dict_out['I_alpha'] = I_alpha

    return dict_out
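
# Minimal usage sketch: a Dirichlet-structured corpus; a smaller alpha gives
# more peaked p(t|d). Values are illustrative.
def _demo_synthetic_dirichlet_terminal():
    dict_out = synthetic_dirichlet_terminal(
        V=500, K=5, dist_w='uni', D=50, m=100, alpha=0.05, seed=7,
        if_info=False)
    # hard per-document labels derived from argmax of p(t|d)
    print(dict_out['document_topic_assign_list'][:10])
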
def tm_inference_terminal(texts,
                          state_dwz_true=None,
                          k_true=None,
                          flag_coherence=0,
                          path_tm=os.path.abspath(
                              os.path.join(os.pardir,
                                           'src/external/topicmapping'))):
    '''
    Do the inference for p_td and state_dwz (optional)

    Input:

        ## provide the corpus
        , 'texts': texts

        ## optional, only works for a synthetic corpus with token labeling
        , 'state_dwz_true': state_dwz
        , 'k_true': K

        ## optional
        , 'path_tm': os.path.abspath(os.path.join(os.pardir, 'src/external/topicmapping'))

    Output:
        dict_output = {
            'p_td_infer': p_td  ## p_td inferred by the topic model
            , 'token_labeling_nmi': nmi  ## optional, token-labeling result, only for synthetic data
            , 'k_infer': inferred number of topics
        }
    '''

    #############################
    # # Generate an empty dict for the output
    dict_output = {}

    #############################
    # # inference for p_dt

    train_dir = make_path_tmp_tm()
    train_fname = texts_corpus_tm(texts, train_dir)
    dir_cwd = os.getcwd()

    os.chdir(path_tm)
    cmd_tm = './bin/topicmap -f %s -o %stest_result' % (train_fname, train_dir)
    p = subprocess.Popen(cmd_tm,
                         shell=True,
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
    p.wait()
    os.chdir(dir_cwd)

    # ############################
    # # get p_td_tm:

    p_td_tm = tm_inference_get_p_td_tm(train_dir)
    dict_output['p_td_infer'] = p_td_tm

    # ############################
    # # get p_wt_tm:
    p_wt_tm = tm_inference_get_p_wt_tm(train_dir)
    dict_output['p_wt_infer'] = p_wt_tm

    # ############################
    # # get the number of topics:
    with open(train_dir + 'test_result/lda_class_words.txt', 'r') as f:
        x = f.readlines()
    k_tm = len(x)
    dict_output['k_infer'] = k_tm

    if flag_coherence == 1:
        state_dwz_tm = tm_inference_get_state_dwz_tm(train_dir)
        dict_gs = corpora.Dictionary(texts)
        all_terms = list(dict_gs.keys())  # iterkeys() is Python 2 only
        V = len(all_terms)
        D = len(texts)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_tm, D, V, k_tm)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # ############################

    # # Get the nmi for token_labeling
    state_dwz_tm = tm_inference_get_state_dwz_tm(train_dir)
    dict_output['state_dwz_infer'] = state_dwz_tm

    if state_dwz_true is not None:

        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_tm, k_true, k_tm)

        dict_output.update(dict_output_token_labeling)

    os.system('rm -rf %s' % (train_dir))

    return dict_output
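
# Minimal usage sketch: run topicmapping on a synthetic corpus and compare
# the inferred token labels against the ground truth. Assumes the default
# path_tm points at a compiled topicmapping build.
def _demo_tm_inference_terminal():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=3, D=20, m=50)
    dict_output = tm_inference_terminal(dict_corpus['texts'],
                                        state_dwz_true=dict_corpus['state_dwz'],
                                        k_true=3)
    print('inferred number of topics: %s' % dict_output['k_infer'])
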
def ldags_inference_terminal(texts,
                             input_k,
                             state_dwz_true=None,
                             k_true=None,
                             input_v=None,
                             path_mallet=None,
                             dN_opt=0,
                             iterations=1000,
                             alpha=50.0,
                             beta=0.01,
                             flag_coherence=0):
    '''
    Do the inference for p_td and state_dwz (optional)

    Input:
        ## provide the corpus and the number of topics
        , 'texts': texts
        , 'input_k': K

        ## optional, only works for a synthetic corpus with token labeling
        , 'state_dwz_true': state_dwz
        , 'k_true': K

        ## optional
        , 'input_v': V  # only needed for 'ldavb' token labeling
        , 'path_mallet': os.path.abspath(os.path.join(os.pardir,'src/external/mallet-2.0.8RC3/bin/mallet'))
        , 'dN_opt': 0
        , 'iterations': 1000

    Output:
        dict_output = {
            'p_td_infer': p_td  ## p_td inferred by the topic model
            , 'token_labeling_nmi': nmi  ## optional, token-labeling result, only for synthetic data
        }
    '''

    # # Generate an empty dict for the output
    dict_output = {}

    # # inference for p_dt
    if input_v is not None:
        # # for a synthetic corpus with token labeling
        dict_gs = gs.corpora.Dictionary([[str(i)] for i in range(input_v)])
    else:
        # # for a real-world corpus, or a synthetic corpus without token labeling
        dict_gs = corpora.Dictionary(texts)

    corpus_gs = [dict_gs.doc2bow(text) for text in texts]

    D = len(texts)
    path_tmp = make_path_tmp_lda()
    model = LdaMallet(path_mallet,
                      corpus_gs,
                      num_topics=input_k,
                      id2word=dict_gs,
                      prefix=path_tmp,
                      iterations=iterations,
                      optimize_interval=dN_opt,
                      workers=1,
                      alpha=alpha,
                      beta=beta)

    # <<< infer p(t|d)
    fdoctopics_path = model.fdoctopics()
    with open(fdoctopics_path, "r") as text_file:
        lines = text_file.readlines()
    p_d_t_ldamallet = np.zeros([D, input_k])

    for d_num in range(D):
        t_d_oneline_str = lines[d_num]
        t_d_oneline_list = t_d_oneline_str.strip('\n').split('\t')[2:]
        for t_num in range(input_k):
            p_d_t_ldamallet[d_num, t_num] = t_d_oneline_list[t_num]

    dict_output['p_td_infer'] = p_d_t_ldamallet
    # >>>

    # <<< Get the nmi for token_labeling
    fname_labels = path_tmp + 'state.mallet.gz'
    state_dwz_infer, alpha_, beta_ = state_read_mallet(fname_labels)

    if state_dwz_true is not None:
        # nmi = state_dwz_nmi(state_dwz_true, state_dwz_infer, k_true, input_k)

        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_infer, k_true, input_k)
        dict_output.update(dict_output_token_labeling)

    # In general, we do not need to output state_dwz_infer
    dict_output['state_dwz_infer'] = state_dwz_infer
    # >>>

    if flag_coherence == 1:
        all_terms = list(dict_gs.keys())  # iterkeys() is Python 2 only
        V = len(all_terms)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_infer, D, V, input_k)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # <<< infer p(w|t)
    D = max(np.array(state_dwz_infer)[:, 0]) + 1
    V = max(np.array(state_dwz_infer)[:, 1]) + 1
    K = input_k
    n_wd_infer, n_wj_infer, n_jd_infer = state_nwjd(state_dwz_infer, D, V, K)

    # # smoothed p(w|t): add beta pseudo-counts to the word-topic counts
    # # and normalize each topic (column) to sum to one
    beta_array = np.ones([V, 1]) * beta
    n_wj_beta_array = n_wj_infer + beta_array
    n_wj_beta_array_vector = np.sum(n_wj_beta_array, axis=0)
    p_wt_infer = n_wj_beta_array / n_wj_beta_array_vector
    dict_output['p_wt_infer'] = p_wt_infer
    # >>>

    os.system('rm -rf %s' % (path_tmp))

    return dict_output
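
# Minimal usage sketch: Mallet-based LDA with Gibbs sampling on a synthetic
# corpus. The Mallet path is an assumption and must point at a local install;
# parameter values are illustrative.
def _demo_ldags_inference_terminal():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=3, D=20, m=50)
    path_mallet = os.path.abspath(
        os.path.join(os.pardir, 'src/external/mallet-2.0.8RC3/bin/mallet'))
    dict_output = ldags_inference_terminal(
        dict_corpus['texts'], 3,
        state_dwz_true=dict_corpus['state_dwz'], k_true=3,
        input_v=200, path_mallet=path_mallet)
    print('p_td_infer shape: %s' % (dict_output['p_td_infer'].shape,))
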
def hdp_inference_terminal(texts,
                           state_dwz_true=None,
                           k_true=None,
                           flag_coherence=0,
                           path_hdp=os.path.abspath(
                               os.path.join(
                                   os.pardir,
                                   'src/external/hdp-bleilab/hdp-faster'))):
    '''
    Do the inference for p_td and state_dwz (optional)

    Input:

        ## provide the corpus
        , 'texts': texts

        ## optional, only works for a synthetic corpus with token labeling
        , 'state_dwz_true': state_dwz
        , 'k_true': K

        ## optional
        , 'path_hdp': os.path.abspath(os.path.join(os.pardir,'src/external/hdp-bleilab/hdp-faster'))

    Output:
        dict_output = {
              'p_td_infer': p_td  ## p_td inferred by the topic model
            , 'token_labeling_nmi': nmi  ## optional, token-labeling result, only for synthetic data
            , 'k_infer': number of topics inferred by the topic model
        }
    '''

    #############################
    # # Generate an empty dict for the output
    dict_output = {}

    # ############################
    # # inference for p_dt

    train_dir = make_path_tmp_hdp()
    train_fname = texts_corpus_hdp(texts, train_dir)
    dir_cwd = os.getcwd()
    os.chdir(path_hdp)

    cmd_hdp = './hdp --train_data %s --directory %s' % (train_fname, train_dir)

    p = subprocess.Popen(cmd_hdp,
                         shell=True,
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
    p.wait()
    os.chdir(dir_cwd)

    # # doc-topic counts
    with open(train_dir + 'final.doc.states', 'r') as f:
        x = f.readlines()
    D_ = len(x)
    K_hdp = len(x[0].split())
    p_td_hdp = np.zeros((D_, K_hdp))
    for i_d, d in enumerate(x):
        n_j_tmp = np.array([int(h_) for h_ in d.split()])
        p_td_hdp[i_d, :] = n_j_tmp / float(np.sum(n_j_tmp))

    dict_output['p_td_infer'] = p_td_hdp

    # ############################
    # # get the number of topics:
    with open(train_dir + 'final.topics', 'r') as f:
        x = f.readlines()
    k_hdp = len(x)
    dict_output['k_infer'] = k_hdp

    # ############################
    # # individual labels
    with open(train_dir + 'final.word-assignments', 'r') as f:
        header = f.readline()  # skip the header line
        x = f.readlines()
    state_dwz_hdp = [tuple([int(h_) for h_ in h.split()]) for h in x]
    dict_output['state_dwz_infer'] = state_dwz_hdp

    if flag_coherence == 1:
        dict_gs = corpora.Dictionary(texts)
        all_terms = list(dict_gs.keys())  # iterkeys() is Python 2 only
        V = len(all_terms)
        D = len(texts)
        n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz_hdp, D, V, k_hdp)
        dict_output['coherence'] = topic_cherence_C(n_wd_, n_wj_)

    # ##############
    # # infer p_wt

    all_word_list = [i[1] for i in state_dwz_hdp]
    n_w = max(all_word_list) + 1

    num_k = k_hdp
    p_wt_infer = np.zeros([n_w, num_k])
    for i in state_dwz_hdp:
        tmp_w = i[1]
        tmp_t = i[2]
        p_wt_infer[tmp_w, tmp_t] += 1
    p_wt_infer = p_wt_infer / p_wt_infer.sum(axis=0)
    dict_output['p_wt_infer'] = p_wt_infer

    # # Get the nmi for token_labeling
    if state_dwz_true is not None:
        dict_output_token_labeling = get_dict_output_token_labeling(
            state_dwz_true, state_dwz_hdp, k_true, k_hdp)
        dict_output.update(dict_output_token_labeling)

    os.system('rm -rf %s' % (train_dir))

    return dict_output
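
# Minimal usage sketch: HDP infers the number of topics itself, so no K is
# passed. Assumes the default path_hdp points at a compiled hdp-faster binary.
def _demo_hdp_inference_terminal():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=3, D=20, m=50)
    dict_output = hdp_inference_terminal(dict_corpus['texts'],
                                         state_dwz_true=dict_corpus['state_dwz'],
                                         k_true=3)
    print('inferred number of topics: %s' % dict_output['k_infer'])
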
def topic_coherence_from_state_dwz(state_dwz, input_k, n=10, eps=1.0):
    D = len(set([dwz[0] for dwz in state_dwz]))  # # number of documents
    V = len(set([dwz[1] for dwz in state_dwz]))  # # number of word types
    n_wd_, n_wj_, n_jd_ = state_nwjd(state_dwz, D, V, input_k)
    C = topic_cherence_C(n_wd_, n_wj_, n=n, eps=eps)
    return np.mean(C)
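
# Minimal usage sketch: mean coherence of the ground-truth labeling of a
# synthetic corpus; n and eps keep the defaults from the signature above.
def _demo_topic_coherence_from_state_dwz():
    dict_corpus = synthetic_single_stopword_terminal(V=200, K=3, D=20, m=50)
    c_mean = topic_coherence_from_state_dwz(dict_corpus['state_dwz'], 3)
    print('mean topic coherence: %s' % c_mean)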