Example #1
    def lda_model(self, text_data, save_path, topic_number=20):
        mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=topic_number)
        for index, doc in enumerate(text_data):
            print(str(index) + " : " + str(doc))
            mdl.add_doc(doc)

        mdl.burn_in = 100
        mdl.train(0)
        print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
        print('Removed top words:', mdl.removed_top_words)
        print('Training...', file=sys.stderr, flush=True)
        for i in range(0, 1500, 10):
            mdl.train(10)
            print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

        print('Saving...', file=sys.stderr, flush=True)
        mdl.save(save_path, True)

        # extract candidates for auto topic labeling
        extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
        cands = extractor.extract(mdl)

        # ranking the candidates of labels for a specific topic
        labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
        for k in range(mdl.k):
            print("== Topic #{} ==".format(k))
            print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
            for word, prob in mdl.get_topic_words(k, top_n=10):
                print(word, prob, sep='\t')
            print()

        return mdl
Example #2
def infer_new_doc():
    '''
    Prior to version 0.10.0, you had to create `Document` instances with `make_doc`
    first and then pass them to `infer`.
    '''
    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(),
                                   stopwords=['.'])
    train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8'))

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    mdl.summary()

    docs = []
    for line in open('enwiki-stemmed-1000.txt', encoding='utf-8'):
        docs.append(mdl.make_doc(line.lower().split()))

    topic_distributions, ll = mdl.infer(docs)

    # print topic distributions of each document
    for doc, topic_dist in zip(docs, topic_distributions):
        #print(doc)
        print(topic_dist)
Example #3
def word_prior_example(input_file):
    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(),
                             stopwords=['.'])
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    corpus.process(open(input_file, encoding='utf-8'))

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
    # The word 'church' is given a prior weight of 1.0 for Topic 0 and 0.1 for the remaining topics,
    # which effectively pins a 'church'-related topic to Topic 0.
    mdl.set_word_prior('church', [1.0 if k == 0 else 0.1 for k in range(20)])
    # Topic 1 for a topic related to 'softwar'
    mdl.set_word_prior('softwar', [1.0 if k == 1 else 0.1 for k in range(20)])
    # Topic 2 for a topic related to 'citi'
    mdl.set_word_prior('citi', [1.0 if k == 2 else 0.1 for k in range(20)])
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs,
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    for k in range(mdl.k):
        print("== Topic #{} ==".format(k))
        for word, prob in mdl.get_topic_words(k, top_n=10):
            print(word, prob, sep='\t')
        print()
Example #4
def raw_corpus_and_labeling_example(input_file):
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords
    stemmer = PorterStemmer()
    stops = set(stopwords.words('english'))
    corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
                             stopwords=lambda x: len(x) <= 2 or x in stops)
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    corpus.process(open(input_file, encoding='utf-8'))

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))
    
    mdl.summary()
    
    # extract candidates for auto topic labeling
    extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
    cands = extractor.extract(mdl)

    labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
    for k in range(mdl.k):
        print("== Topic #{} ==".format(k))
        print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
        for word, prob in mdl.get_topic_words(k, top_n=10):
            print(word, prob, sep='\t')
        print()
Example #5
def infer_new_corpus():
    '''
    Since version 0.10.0, inference using an instance of `Corpus` has been supported.
    '''

    train_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(),
                                   stopwords=['.'])
    train_corpus.process(open('enwiki-stemmed-1000.txt', encoding='utf-8'))

    test_corpus = tp.utils.Corpus(tokenizer=tp.utils.SimpleTokenizer(),
                                  stopwords=['.'])
    test_corpus.process(open('corpus_to_be_inferred.txt', encoding='utf-8'))

    # make LDA model and train
    mdl = tp.LDAModel(k=20, min_cf=10, min_df=5, corpus=train_corpus)
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs),
          ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    mdl.summary()

    inferred_corpus, ll = mdl.infer(test_corpus)

    # print topic distributions of each document
    for doc in inferred_corpus:
        #print(doc.raw) # print raw string of the document
        #print(list(doc)) # print a list of words within the document
        print(doc.get_topic_dist())
Example #6
def tp_one_trial(dataset, model_type, topic_size, sample_size, min_cf=3, rm_top=5,
             max_iter=1000, min_iter=None, checkpoint=None, stop_increase=1, metric='ll'):
    assert model_type in ['lda', 'ctm', 'slda', 'hdp'], f'invalid `model_type`: {model_type}...'
    assert metric in ['ll', 'pp'], f'invalid `metric`: {metric}...'
    if model_type == 'lda':
        model = tp.LDAModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'ctm':
        model = tp.CTModel(k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'slda':
        model = tp.SLDAModel(k=topic_size, vars='b', tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    elif model_type == 'hdp':
        model = tp.HDPModel(initial_k=topic_size, tw=tp.TermWeight.ONE, min_cf=min_cf, rm_top=rm_top)
    sample_size = min(sample_size, len(dataset))
    
#     max_iter = max_iter * sample_size * topic_size // 2000  # ensure the number of iterations increases with the size of sample
    model.burn_in = max_iter // 5  # set burn-in: 20 percent of max iterations

    for i in range(sample_size):
        doc, label = dataset[i]
        if model_type == 'slda':
            model.add_doc(doc, [float(label)])
        else:
            model.add_doc(doc)

    if min_iter is None:
        min_iter = max_iter // 5
    if checkpoint is None:
        checkpoint = max_iter // 5

    model.train(min_iter)

    pre_metric = -np.inf  # np.infty is a deprecated alias removed in NumPy 2.0
    stop_increase_cnt = 0
    cur_metric = 0.
    for i in range(1, max_iter+1):
        model.train(1)
        # both metrics are larger-is-better
        if metric == 'll':
            cur_metric += model.ll_per_word
        elif metric == 'pp':
            cur_metric += -model.perplexity  # smaller perplexity is better, so negate

        if i % checkpoint == 0:
            cur_metric /= checkpoint
            print(f'Current loss: {cur_metric:.5f}')
            if cur_metric >= pre_metric:
                pre_metric = cur_metric
            else:
                stop_increase_cnt += 1
            cur_metric = 0.

        if stop_increase_cnt >= stop_increase:
            break

    final_metric = model.perplexity if metric == 'pp' else model.ll_per_word

    print(f'Trial iterations: {i + min_iter}.')
    return model, final_metric
Example #7
    def train_model(self, dataset, hyperparameters):

        if self.use_partitions:
            X_train, X_test = dataset.get_partitioned_corpus(
                use_validation=False)
        else:
            X_train = dataset.get_corpus()
            X_test = None

        mdl = tp.LDAModel(k=self.hyperparameters['num_topics'],
                          alpha=self.hyperparameters['alpha'],
                          eta=self.hyperparameters['eta'],
                          min_cf=self.hyperparameters['min_cf'],
                          min_df=self.hyperparameters['min_df'],
                          rm_top=self.hyperparameters['rm_top'])

        for i in X_train:
            mdl.add_doc(i)

        mdl.train(self.hyperparameters['max_iter'])

        topic_word_matrix = np.stack([
            mdl.get_topic_word_dist(k) for k in range(mdl.k)
        ])  # topic word distribution matrix
        topic_document_matrix = np.stack([
            doc.get_topic_dist() for doc in mdl.docs
        ])  # topic document distribution matrix

        # topics extraction
        topic_w = []
        for k in range(mdl.k):
            topics = []
            for word in mdl.get_topic_words(k):
                topics.append(word[0])
            topic_w.append(topics)

        # Output model on the Train Set
        info = {}
        info['topics'] = np.asarray(topic_w)
        info['topic-word-matrix'] = topic_word_matrix
        info['topic-document-matrix'] = topic_document_matrix

        # Inference on the test set
        if X_test is not None:
            doc_inst = [mdl.make_doc(i) for i in X_test]
            topic_dist, _ = mdl.infer(doc_inst)  # topic document distribution
            info['test-topic-document-matrix'] = np.asarray(topic_dist)

        # Manage the model output

        info_diz = {}
        info_diz['topics'] = info['topics']
        info_diz['topic-document-matrix'] = info['topic-document-matrix']
        if X_test is not None:
            info_diz['test-topic-document-matrix'] = info[
                'test-topic-document-matrix']
        return info_diz
Example #8
def test_coherence():
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, k=20, min_df=5, rm_top=5)
    for line in open(curpath + '/sample.txt', encoding='utf-8'):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.train(1000)

    for coh in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
        coherence = tp.coherence.Coherence(corpus=mdl, coherence=coh)
        print(coh, coherence.get_score())
Example #9
def bench_tomotopy(k, ps, w=0):
    model = tp.LDAModel(k=k)
    for text in open(filename, encoding='utf-8'):
        model.add_doc(filter(lambda x: x != '.', text.strip().split()))
    #print('Number of vocabs:', len(model.vocabs))

    start_time = time.time()
    model.train(200, workers=w, parallel=ps)
    #for i in range(k): print(model.get_topic_words(i))
    print('K=%d\tW=%d\tTime: %.5g' % (k, w, time.time() - start_time), end='\t')
    print('LL: %g' % model.ll_per_word, flush=True)
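
A hypothetical driver for the benchmark above (not from the original snippet): it assumes the global `filename` already points at a whitespace-tokenized corpus, and sweeps the two concrete parallelization schemes tomotopy exposes.

# Hypothetical benchmark driver; `filename` is assumed to be set elsewhere.
if __name__ == '__main__':
    for ps in (tp.ParallelScheme.COPY_MERGE, tp.ParallelScheme.PARTITION):
        for k in (10, 20, 40):
            bench_tomotopy(k=k, ps=ps, w=0)  # w=0 lets tomotopy use all cores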
Example #10
def create_lda(tw=tp.TermWeight.IDF,
               min_cf=0,
               min_df=5,
               rm_top=0,
               k=2,
               alpha=0.1,
               eta=1,
               seed=101,
               corpus=None):
    """
    Creates a tomotopy LDAModel()
    Parameters:
        tw: Union[int, TermWeight]
            term weighting scheme in https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
            I chose the default to be inverse document frequency, which means that cards that appear in
            almost all decks are weighted lower than cards that appear in very few decks.
        min_cf: int
            Unless I'm mistaken, this is the minimum number of times that a card must appear at all in
            any deck to be included. However, since the vast majority of cards can be included at most
            once, this is almost always going to be the same as min_df.
        min_df: int
            Minimum number of times that a card must appear in a deck to be included in the analysis;
            default is set to 5.
        rm_top: int
            When ranking the most popular cards that are included in a given commander's decks, this
            parameter will remove the top n of them. Default is 0.
        k: int
            Number of themes/archetypes to sort decks into, from 1 to 32,767. The default value is 2.
        alpha: float
            "hyperparameter of Dirichlet distribution for document-topic". Increasing alpha ... Based
            on advice from Eduardo Coronado (@ecoronado92 on Twitter), default for alpha is set to 0.1.
        eta: float
            "hyperparameter of Dirichlet distribution for topic-word". Increasing eta ... Based on
            experimentation, default for eta is 1.
        seed: int
            Random seed. Set to 101 as default in an attempt to duplicate results; however, said
            duplication has proven to be... elusive.
        corpus: tomotopy Corpus
            A list of documents to be added into the model. If None, documents have to be added
            after the model is created through LDAModel.add_doc() before the model can be trained.
    :return:
        tomotopy LDA model object
    """

    lda = tp.LDAModel(tw=tw,
                      min_cf=min_cf,
                      min_df=min_df,
                      rm_top=rm_top,
                      k=k,
                      alpha=alpha,
                      eta=eta,
                      seed=seed,
                      corpus=corpus)
    return lda
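
A minimal usage sketch for the factory above; `deck_corpus` is an illustrative iterable of token lists, not a name defined in the source.

# Hypothetical usage of create_lda; `deck_corpus` is illustrative.
lda = create_lda(k=8, rm_top=10)
for deck in deck_corpus:  # each deck is a list of card-name tokens
    lda.add_doc(deck)
lda.train(1000)
for topic_id in range(lda.k):
    print(topic_id, lda.get_topic_words(topic_id, top_n=10))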
Example #11
def test_uid():
    cps = tp.utils.Corpus()
    cps.add_doc("test text".split(), uid="001")
    cps.add_doc("test text".split(), uid="abc")
    cps.add_doc("test text".split(), uid="0x1f")

    mdl = tp.LDAModel(k=2, corpus=cps)
    assert len(cps) == len(mdl.docs)
    assert cps[0].uid == mdl.docs[0].uid
    print(mdl.docs["001"])
    print(mdl.docs["abc"])
    print(mdl.docs["0x1f"])
Example #12
def train_lda_model_from_data(filtered_data, topics=30):
    mdl = tp.LDAModel(k=topics)
    for data in tqdm.tqdm(filtered_data.keys()):
        mdl.add_doc(chain.from_iterable(filtered_data[data][1]))

    print("Beginning LDA training...")
    for i in range(0, 1000, 10):
        mdl.train(10)
        if (i % 100 == 0):
            print('Iteration: {}\tLog-likelihood: {}'.format(
                i, mdl.ll_per_word))
    print("Finished Training")
    return mdl
Example #13
def train_tomotopy_model(corpus: dict,
                         num_topics: int = 300,
                         verbose: bool = True):

    mdl = tp.LDAModel(k=num_topics)

    corpus_ = {k: v for k, v in corpus.items() if len(v) > 0}

    for doc in corpus_.values():
        mdl.add_doc(doc)

    for i in range(0, 150, 10):
        mdl.train(10)
        print("Iteration: {}".format(i))

    if verbose:
        mdl.summary()  # summary() prints directly and returns None

    return mdl
Example #14
def tm_estimation(corpus_, k_):
    # estimation
    model = tp.LDAModel(min_df=5, rm_top=20, k=k_, seed=0, corpus=corpus_)
    model.train(0)
    # log
    print("Num docs:{}, Num Vocabs:{}, Total Words:{}".format(
        len(model.docs), len(model.used_vocabs), model.num_words))
    print("Removed Top words: ", *model.removed_top_words)
    # model training
    for i in range(0, 1000, 20):
        print("Iteration: {:04}, LL per word: {:.4}".format(
            i, model.ll_per_word))
        model.train(20)
    print("Iteration: {:04}, LL per word: {:.4}".format(
        1000, model.ll_per_word))
    # summary
    model.summary()
    # return
    return model
Example #15
File: example.py  Project: laranea/tomotopy
def lda_example(input_file, save_path):
    mdl = tp.LDAModel(tw=tp.TermWeight.ONE, min_cf=3, rm_top=5, k=20)
    for n, line in enumerate(open(input_file, encoding='utf-8')):
        ch = line.strip().split()
        mdl.add_doc(ch)
    mdl.burn_in = 100
    mdl.train(0)
    print('Num docs:', len(mdl.docs), ', Vocab size:', mdl.num_vocabs, ', Num words:', mdl.num_words)
    print('Removed top words:', mdl.removed_top_words)
    print('Training...', file=sys.stderr, flush=True)
    for i in range(0, 1000, 10):
        mdl.train(10)
        print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

    print('Saving...', file=sys.stderr, flush=True)
    mdl.save(save_path, True)

    for k in range(mdl.k):
        print('Topic #{}'.format(k))
        for word, prob in mdl.get_topic_words(k):
            print('\t', word, prob, sep='\t')
Example #16
def test_docs():
    from nltk.stem.porter import PorterStemmer
    from nltk.corpus import stopwords
    stemmer = PorterStemmer()
    stopwords = set(stopwords.words('english'))
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(stemmer=stemmer.stem),
        stopwords=lambda x: len(x) <= 2 or x in stopwords)
    # data_feeder yields a tuple of (raw string, user data) or a str (raw string)
    for i, line in enumerate(
            open(curpath + '/sample_raw.txt', encoding='utf-8')):
        corpus.add_doc(raw=line, uid='doc{:05}'.format(i), etc=len(line))

    def _test_doc(doc, etc=False):
        print("doc", doc)

        print("len(doc)", len(doc))
        print("doc.__getitem__", doc[0], doc[1], doc[2], doc[3])

        if etc: print("doc.etc", doc.etc)
        print("doc.words", doc.words[:10])
        print("doc.span", doc.span[:10])
        print("doc.raw", doc.raw[:10])

    print("len(corpus)", len(corpus))
    print("len(corpus[:10])", len(corpus[:10]))

    _test_doc(corpus[0], etc=True)

    mdl = tp.LDAModel(k=10, corpus=corpus)
    mdl.train(100)
    print("len(mdl.docs)", len(mdl.docs))
    print("len(mdl.docs[:10])", len(mdl.docs[:10]))

    ch = tp.coherence.Coherence(corpus=mdl, coherence='u_mass')
    for k in range(mdl.k):
        print('Coherence of #{} : {}'.format(k, ch.get_score(topic_id=k)))

    _test_doc(mdl.docs[0])
Example #17
# initialize a pipeline
nlp = en_core_web_lg.load()
# process data
docs_tokens = []
for item in df.loc[:, "Review"].to_list():
    tmp_tokens = [
        token.lemma_ for token in nlp(item)
        if not token.is_stop and not token.is_punct and not token.like_num
    ]
    docs_tokens.append(tmp_tokens)

# %% Tomotopy LDA estimation
# create a corpus using tp utilities
corpus = tp.utils.Corpus()
# populate the corpus
for item in docs_tokens:
    corpus.add_doc(words=item)
# estimate a model with 10 topics
lda = tp.LDAModel(k=10, corpus=corpus)
# train the model
for i in range(0, 100, 10):
    lda.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, lda.ll_per_word))
# display words by topics
for k in range(lda.k):
    print("Top 10 words of topic #{}".format(k))
    print(lda.get_topic_words(k, top_n=10))
# save model estimates
lda.save('hotel_review_lda_estimates.bin')
Example #18
try:
    # load if preprocessed corpus exists
    corpus = tp.utils.Corpus.load('preprocessed_20news.cps')
except IOError:
    porter_stemmer = nltk.PorterStemmer().stem
    english_stops = set(porter_stemmer(w) for w in stopwords.words('english'))
    pat = re.compile('^[a-z]{2,}$')
    corpus = tp.utils.Corpus(
        tokenizer=tp.utils.SimpleTokenizer(porter_stemmer), 
        stopwords=lambda x: x in english_stops or not pat.match(x)
    )
    newsgroups_train = fetch_20newsgroups()
    corpus.process(d.lower() for d in newsgroups_train.data)
    # save preprocessed corpus for reuse
    corpus.save('preprocessed_20news.cps')

mdl = tp.LDAModel(min_df=5, rm_top=40, k=30, corpus=corpus)
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

# Let's train the model
for i in range(0, 1000, 20):
    print('Iteration: {:04}, LL per word: {:.4}'.format(i, mdl.ll_per_word))
    mdl.train(20)
print('Iteration: {:04}, LL per word: {:.4}'.format(1000, mdl.ll_per_word))

mdl.summary()
Example #19
def main():
    """
    Generate synthetic data, fit LDA and PLDA topic models, and evaluate
    domain-transfer classifiers on the learned representations.
    """
    ######################
    ### Setup
    ######################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Output
    if config.output_dir is not None and not os.path.exists(config.output_dir):
        _ = os.makedirs(config.output_dir)
    ## Cache Config
    if config.output_dir is not None and config.run_id is not None:
        _ = os.system(
            f"cp {args.config} {config.output_dir}/{config.run_id}.config.json"
        )
    ## Set Random State
    if config.random_state is not None:
        np.random.seed(config.random_state)
    ######################
    ### Data Generating Process
    ######################
    ## Generate Data
    X_latent, X, y, D, theta, phi = data_generating_process(
        config.N,
        config.sigma_0,
        config.p_domain,
        config.gamma,
        config.V,
        config.theta,
        config.coef,
        beta=config.beta,
        random_state=config.random_state)
    ## Data Distribution Plot
    if args.make_plots:
        fig, ax = fit_latent_regression(X_latent, y, D, config.coef)
        plt.show()
    ######################
    ### Fit Topic Models
    ######################
    ## Split Data into Training and Test
    train_ind = list(range(int(config.N * .8)))
    test_ind = list(range(int(config.N * .8), config.N))
    ## Generate Corpus
    train_corpus = tp.utils.Corpus()
    full_corpus = tp.utils.Corpus()
    ## Add Training Data
    for n in range(X.shape[0]):
        doc_n = doc_to_str(X[n])
        full_corpus.add_doc(doc_n, label=[str(D[n])])
        if n <= train_ind[-1]:
            train_corpus.add_doc(doc_n, label=[str(D[n])])
    assert len(train_corpus) == len(train_ind)
    ## Initialize Models (3 Topics Total)
    lda = tp.LDAModel(k=3,
                      corpus=train_corpus,
                      seed=config.random_state if config.random_state
                      is not None else np.random.randint(1e6))
    plda = tp.PLDAModel(latent_topics=1,
                        topics_per_label=1,
                        corpus=train_corpus,
                        seed=config.random_state if config.random_state
                        is not None else np.random.randint(1e6))
    ## Initialize Sampler
    lda.train(1)
    plda.train(1)
    ## Update Parameters based on Corpus
    V_nn = lda.num_vocabs
    ## MCMC Storage
    n_iter = max(config.n_iter_lda, config.n_iter_plda)
    likelihood = np.zeros((n_iter, 2)) * np.nan
    theta_lda = np.zeros((n_iter, config.N, 3)) * np.nan
    theta_plda = np.zeros((n_iter, config.N, 3)) * np.nan
    phi_lda = np.zeros((n_iter, 3, V_nn)) * np.nan
    phi_plda = np.zeros((n_iter, 3, V_nn)) * np.nan
    ## Word Count
    train_word_n = sum([len(d.words) for d in full_corpus[train_ind]])
    test_word_n = sum([len(d.words) for d in full_corpus[test_ind]])
    ## Train LDA Model
    for epoch in tqdm(range(config.n_iter_lda), desc="LDA Training"):
        lda.train(1)
        train_inf, train_ll = lda.infer(full_corpus[train_ind],
                                        iter=config.n_sample)
        test_inf, test_ll = lda.infer(full_corpus[test_ind],
                                      iter=config.n_sample)
        likelihood[epoch, 0] = train_ll.sum() / train_word_n
        theta_lda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_lda[epoch] = np.vstack(
            [lda.get_topic_word_dist(t) for t in range(lda.k)])
    ## Train PLDA Model
    for epoch in tqdm(range(config.n_iter_plda), desc="PLDA Training"):
        plda.train(1)
        train_inf, train_ll = plda.infer(full_corpus[train_ind],
                                         iter=config.n_sample)
        test_inf, test_ll = plda.infer(full_corpus[test_ind],
                                       iter=config.n_sample)
        likelihood[epoch, 1] = train_ll.sum() / train_word_n
        theta_plda[epoch] = np.vstack(
            flatten([[d.get_topic_dist() for d in inf]
                     for inf in [train_inf, test_inf]]))
        phi_plda[epoch] = np.vstack(
            [plda.get_topic_word_dist(t) for t in range(plda.k)])
    ## Plot Likelihood
    if args.make_plots:
        plt.figure(figsize=(10, 5.8))
        plt.plot(likelihood[:, 0], label="LDA")
        plt.plot(likelihood[:, 1], label="PLDA")
        plt.xlabel("Training Epoch", fontweight="bold")
        plt.ylabel("Log Likelihood Per Word", fontweight="bold")
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.show()
    ## Plot Traces for Phi
    if args.make_plots:
        fig, axes = plt.subplots(phi_lda.shape[1], 2, figsize=(10, 5.8))
        for m, (mphi,
                mdl) in enumerate(zip([phi_lda, phi_plda], ["LDA", "PLDA"])):
            ax = axes[:, m]
            for k in range(mphi.shape[1]):
                ax[k].plot(mphi[:, k, :])
                ax[k].set_ylabel("Parameter Value", fontweight="bold")
                ax[k].spines["top"].set_visible(False)
                ax[k].spines["right"].set_visible(False)
            ax[k].set_xlabel("Training Epoch", fontweight="bold")
            ax[0].set_title(f"{mdl} $\\phi$ Trace", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Plot Sample Traces for Theta
    if args.make_plots:
        fig, ax = plt.subplots(5, 2, sharex=False, figsize=(10, 5.8))
        for d, doc in enumerate(
                sorted(np.random.choice(config.N, 5, replace=False))):
            ax[d, 0].plot(theta_lda[:, doc, :])
            ax[d, 1].plot(theta_plda[:, doc, :])
            for i in range(2):
                ax[d, i].spines["right"].set_visible(False)
                ax[d, i].spines["top"].set_visible(False)
                ax[d, i].set_title(f"Document {doc}",
                                   loc="left",
                                   fontstyle="italic")
                ax[d, i].set_ylabel("$\\theta$")
        for m, mdl in enumerate(["LDA", "PLDA"]):
            ax[-1, m].set_xlabel(f"{mdl} Training Epoch", fontweight="bold")
        fig.tight_layout()
        plt.show()
    ## Get Final Representations
    X_latent_lda = np.vstack([
        d.get_topic_dist() for d in lda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    X_latent_plda = np.vstack([
        d.get_topic_dist() for d in plda.infer(
            full_corpus, iter=config.n_sample, together=False)[0]
    ])
    ## Isolate Latent Variables and Normalize
    X_latent_plda = X_latent_plda[:, -plda.latent_topics:]
    ## Fit Classifiers
    source_train_ind = sorted(set(train_ind) & set(np.where(D == 0)[0]))
    lr_lda = LogisticRegression()
    lr_lda.fit(X_latent_lda[source_train_ind], y[source_train_ind])
    lr_plda = LogisticRegression()
    lr_plda.fit(X_latent_plda[source_train_ind], y[source_train_ind])
    ## Make Test Predictions
    y_test_lda = lr_lda.predict_proba(X_latent_lda)[:, 1]
    y_test_plda = lr_plda.predict_proba(X_latent_plda)[:, 1]
    ## Score Predictions
    scores = score_model(y, y_test_lda, y_test_plda, D, test_ind, True)
    if config.output_dir is not None and config.run_id is not None:
        with open(f"{config.output_dir}/{config.run_id}.scores.json",
                  "w") as the_file:
            json.dump(scores, the_file)
Example #20
def create_model(k, alpha, eta):
    return tp.LDAModel(k=k, rm_top=20, alpha=alpha,
                       eta=eta), 'lda_{}'.format(k)
Example #21
def load_model(input, model_name):
    # `load` is a static method, so no throwaway LDAModel instance is needed
    return tp.LDAModel.load('{}{}'.format(input, model_name))
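
A hedged save/load round trip combining Example #20 and Example #21; the directory and the elided training steps are illustrative.

# Hypothetical round trip using the two helpers above.
mdl, name = create_model(k=20, alpha=0.1, eta=0.01)
# ... add documents with mdl.add_doc(...) and train with mdl.train(...) ...
mdl.save('models/{}.bin'.format(name))
mdl = load_model('models/', '{}.bin'.format(name))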
Example #22
def main():
    """
    Train a (P)LDA topic model on source and target corpora, then train and
    score downstream depression classifiers on the inferred topic distributions.
    """
    ###################
    ### Script Setup
    ###################
    ## Parse Command Line
    args = parse_arguments()
    ## Load Configuration
    config = Config(filepath=args.config)
    ## Check Sampler
    if not valid_sampler(config):
        raise ValueError("Configuration results in no inferences. Change burn-in or sample frequency.")
    ## Create Output Directories
    basedir = f"{config.output_dir}/" if args.fold is None else f"{config.output_dir}/fold-{args.fold}/".replace("//","/")
    dirs = ["topic_model/document_topic/","topic_model/topic_word/","classification/"]
    for d in dirs:
        ddir = f"{basedir}{d}"
        if not os.path.exists(ddir):
            _ = os.makedirs(ddir)
    ## Cache Configuration
    _ = os.system(f"cp {args.config} {config.output_dir}/config.json")
    ## Set Random Seed
    if config.random_seed is not None:
        np.random.seed(config.random_seed)
    ###################
    ### Data Preparation
    ###################
    ## Load Data
    LOGGER.info("Loading Processed Datasets")
    X_source, y_source, splits_source, filenames_source, users_source, terms_source = load_data(f"{DEPRESSION_DATA_DIR}{config.source}/")
    X_target, y_target, splits_target, filenames_target, users_target, terms_target = load_data(f"{DEPRESSION_DATA_DIR}{config.target}/")
    ## Align Vocabulary Spaces
    LOGGER.info("Aligning Vocabularies")
    X_source, X_target, vocab = align_data(X_source, X_target, terms_source, terms_target, config.vocab_alignment)
    ## Split Data
    LOGGER.info("Separating Datasets by Split")
    Xs_train, Xs_dev, Xs_test, ys_train, ys_dev, ys_test = split_data(X_source, y_source, splits_source)
    Xt_train, Xt_dev, Xt_test, yt_train, yt_dev, yt_test = split_data(X_target, y_target, splits_target)
    ## Sampling 
    LOGGER.info("Sampling Source Data")
    Xs_train, ys_train = sample_data(Xs_train, ys_train, config.source_class_ratio.get("train"), config.source_sample_size.get("train"))
    Xs_dev, ys_dev = sample_data(Xs_dev, ys_dev, config.source_class_ratio.get("dev"), config.source_sample_size.get("dev"))
    Xs_test, ys_test = sample_data(Xs_test, ys_test, config.source_class_ratio.get("test"), config.source_sample_size.get("test"))
    LOGGER.info("Sampling Target Data")
    Xt_train, yt_train = sample_data(Xt_train, yt_train, config.target_class_ratio.get("train"), config.target_sample_size.get("train"))
    Xt_dev, yt_dev = sample_data(Xt_dev, yt_dev, config.target_class_ratio.get("dev"), config.target_sample_size.get("dev"))
    Xt_test, yt_test = sample_data(Xt_test, yt_test, config.target_class_ratio.get("test"), config.target_sample_size.get("test"))
    ## Cross Validation
    if args.fold is not None:
        LOGGER.info(f"Isolating K-Fold Data (Fold {args.fold})")
        ## Initialize Splitter
        splitter = StratifiedKFold(n_splits=args.k_folds,
                                   shuffle=True,
                                   random_state=config.random_seed)
        ## Merge Data
        Xs_all = sparse.vstack([Xs_train, Xs_dev])
        Xt_all = sparse.vstack([Xt_train, Xt_dev])
        ys_all = np.hstack([ys_train,ys_dev])
        yt_all = np.hstack([yt_train,yt_dev])
        ## Get Train and Dev Splits for the Fold
        splits_source = list(splitter.split(Xs_all, ys_all))[args.fold-1]
        splits_target = list(splitter.split(Xt_all, yt_all))[args.fold-1]
        ## Isolate Relevant Data
        Xs_train, ys_train = Xs_all[splits_source[0]], ys_all[splits_source[0]]
        Xs_dev, ys_dev = Xs_all[splits_source[1]], ys_all[splits_source[1]]
        Xt_train, yt_train = Xt_all[splits_target[0]], yt_all[splits_target[0]]
        Xt_dev, yt_dev = Xt_all[splits_target[1]], yt_all[splits_target[1]]
    ###################
    ### Corpus Generation
    ###################
    ## Sample Topic Model Training Masks
    if config.topic_model_data.get("source") is not None:
        if config.topic_model_data.get("source") > Xs_train.shape[0]:
            LOGGER.warning("Requested Source Topic Model Train Size Greater than Available Data. Downsizing.")
            config.topic_model_data["source"] = Xs_train.shape[0]
        source_mask = sorted(np.random.choice(Xs_train.shape[0], size=config.topic_model_data.get("source"), replace=False))
    else:
        source_mask = list(range(Xs_train.shape[0]))
    if config.topic_model_data.get("target") is not None:
        if config.topic_model_data.get("target") > Xt_train.shape[0]:
            LOGGER.warning("Requested Target Topic Model Train Size Greater than Available Data. Downsizing.")
            config.topic_model_data["target"] = Xt_train.shape[0]
        target_mask = sorted(np.random.choice(Xt_train.shape[0], size=config.topic_model_data.get("target"), replace=False))
    else:
        target_mask = list(range(Xt_train.shape[0]))
    ## Initialize Corpus
    LOGGER.info("Generating Training Corpus (Topic-Model Learning)")
    train_corpus, train_missing = generate_corpus(Xs_train, label="source", mask=source_mask, num_jobs=args.num_jobs)
    train_corpus, train_missing = generate_corpus(Xt_train, label="target", mask=target_mask, corpus=train_corpus, missing=train_missing, num_jobs=args.num_jobs)
    LOGGER.info("Generating Training Corpus (Inference)")
    if config.topic_model_data.get("source") is None and config.topic_model_data.get("target") is None:
        LOGGER.info("Using Training Corpus for Inference")
        train_corpus_infer = train_corpus
        train_missing_infer = train_missing
    else:
        train_corpus_infer, train_missing_infer = generate_corpus(Xs_train, label="source", missing={}, num_jobs=args.num_jobs)
        train_corpus_infer, train_missing_infer = generate_corpus(Xt_train, label="target", corpus=train_corpus_infer, missing=train_missing_infer, num_jobs=args.num_jobs)
    LOGGER.info("Generating Development Corpus (Inference)")
    development_corpus, dev_missing = generate_corpus(Xs_dev, label="source", missing={}, num_jobs=args.num_jobs)
    development_corpus, dev_missing = generate_corpus(Xt_dev, label="target", corpus=development_corpus, missing=dev_missing, num_jobs=args.num_jobs)
    if args.evaluate_test:
        LOGGER.info("Generating Test Corpus (Inference)")
        test_corpus, test_missing = generate_corpus(Xs_test, label="source", missing={}, num_jobs=args.num_jobs)
        test_corpus, test_missing = generate_corpus(Xt_test, label="target", corpus=test_corpus, missing=test_missing, num_jobs=args.num_jobs)
    ###################
    ### Topic Model (Training)
    ###################
    ## Initialize Model
    if config.use_plda:
        model = tp.PLDAModel(alpha=config.alpha,
                             eta=config.beta,
                             latent_topics=config.k_latent,
                             topics_per_label=config.k_per_label,
                             min_df=config.min_doc_freq,
                             rm_top=config.rm_top,
                             corpus=train_corpus,
                             seed=config.random_seed)
    else:
        model = tp.LDAModel(alpha=config.alpha,
                            eta=config.beta,
                            k=config.k_latent,
                            min_df=config.min_doc_freq,
                            rm_top=config.rm_top,
                            corpus=train_corpus,
                            seed=config.random_seed)
    ## Initialize Sampler
    model.train(1, workers=args.num_jobs)
    ## Corpus-Updated Parameters
    V = model.num_vocabs
    N = len(model.docs)
    N_train = len(train_corpus_infer)
    N_dev = len(development_corpus)
    N_test = len(test_corpus) if args.evaluate_test else 0
    K = model.k
    ## Gibbs Cache
    ll = np.zeros(config.n_iter)
    theta_train = []
    theta_dev = []
    theta_test = [] if args.evaluate_test else None
    if args.cache_parameters:
        phi = np.zeros((config.n_iter, K, V))
        theta = np.zeros((config.n_iter, N, K))
    else:
        phi = np.zeros((K,V))
        theta = np.zeros((N, K))
    ## Train Model
    for epoch in tqdm(range(0, config.n_iter), desc="MCMC Iteration", file=sys.stdout):
        ## Run Sample Epoch
        model.train(1, workers=args.num_jobs)
        ## Examine Data Fit
        ll[epoch] = model.ll_per_word
        ## Cache Model Parameters
        if args.cache_parameters:
            phi[epoch] = np.vstack([model.get_topic_word_dist(i) for i in range(K)])
            theta[epoch] = np.vstack([d.get_topic_dist() for d in model.docs])
        elif epoch == (config.n_iter - 1):
            phi = np.vstack([model.get_topic_word_dist(i) for i in range(K)])
            theta = np.vstack([d.get_topic_dist() for d in model.docs])
        ## Make Inferences Regularly
        if (epoch + 1) >= config.n_burn and (epoch + 1) % config.infer_sample_rate == 0:
            ## Training Inference
            train_dist, _ = model.infer(train_corpus_infer, iter=config.n_sample, together=False)
            theta_train.append(np.vstack([t.get_topic_dist() for t in train_dist]))
            ## Development Inference
            dev_dist, _ = model.infer(development_corpus, iter=config.n_sample, together=False)
            theta_dev.append(np.vstack([d.get_topic_dist() for d in dev_dist]))
            ## Test Inference
            if args.evaluate_test:
                test_dist, _ = model.infer(test_corpus, iter=config.n_sample, together=False)
                theta_test.append(np.vstack([t.get_topic_dist() for t in test_dist]))
    ## Stack Inferences
    theta_train = np.stack(theta_train)
    theta_dev = np.stack(theta_dev)
    if args.evaluate_test:
        theta_test = np.stack(theta_test)
    ## Cache Model Summary
    _ = model.summary(topic_word_top_n=20, file=open(f"{basedir}/topic_model/model_summary.txt","w"))
    ################
    ### Topic Model Diagnostics
    ################
    ## Plot Likelihood
    fig, ax = plt.subplots(figsize=(10,5.8))
    ax.plot(ll)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.set_xlabel("MCMC Iteration", fontweight="bold")
    ax.set_ylabel("Log-Likelihood Per Word", fontweight="bold")
    fig.tight_layout()
    fig.savefig(f"{basedir}/topic_model/log_likelihood_train{args.plot_fmt}",dpi=300)
    plt.close(fig)
    ## Evaluate Topics
    for k in range(model.k):
        top_terms = [i[0] for i in model.get_topic_words(k, top_n=20)]
        if config.use_plda:
            LOGGER.info("{}: {}".format(which_plda_topic(k, model), ", ".join(top_terms)))
        else:
            LOGGER.info("{}: {}".format(k, ", ".join(top_terms)))
    ## Show Average Topic Distribution (Training Data)
    LOGGER.info("Plotting Average Topic Distributions")
    try:
        fig, ax = plot_average_topic_distribution(theta=theta_train,
                                                  model=model,
                                                  use_plda=config.use_plda,
                                                  n_burn=0)
        fig.savefig(f"{basedir}/topic_model/average_topic_distribution_train{args.plot_fmt}",dpi=300)
        plt.close(fig)
    except:
        pass
    ## Show Average Topic Distribution (Development Data)
    try:
        fig, ax = plot_average_topic_distribution(theta=theta_dev,
                                                  model=model,
                                                  use_plda=config.use_plda,
                                                  n_burn=0)
        fig.savefig(f"{basedir}/topic_model/average_topic_distribution_development{args.plot_fmt}",dpi=300)
        plt.close(fig)
    except:
        pass
    ## Show Trace for a Document Topic Distribution (Random Sample)
    if args.plot_document_topic:
        LOGGER.info("Plotting Sample of Document Topic Distributions")
        for doc_n in np.random.choice(theta_train.shape[1], 10):
            try:
                fig, ax = plot_document_topic_distribution(doc=doc_n,
                                                           theta=theta)
                fig.savefig(f"{basedir}/topic_model/document_topic/train_{doc_n}{args.plot_fmt}",dpi=300)
                plt.close(fig)
            except:
                pass
    ## Show Trace for a Topic Word Distribution
    if args.plot_topic_word:
        LOGGER.info("Plotting Topic Word Distributions")
        for topic in tqdm(range(K), total=K, desc="Topic Word Distribution", file=sys.stdout):
            try:
                fig, ax = plot_topic_word_distribution(topic=topic,
                                                       phi=phi,
                                                       model=model,
                                                       use_plda=config.use_plda,
                                                       n_trace=30,
                                                       n_top=30,
                                                       n_burn=config.n_burn if config.n_burn < phi.shape[0] else -100)
                fig.savefig(f"{basedir}/topic_model/topic_word/topic_{topic}{args.plot_fmt}",dpi=300)
                plt.close(fig)
            except:
                pass
    ################
    ### Depression Classifier Training
    ################
    LOGGER.info("Beginning Classifier Training Procedure")
    ## Isolate General Latent Representations
    theta_train_latent = theta_train[:,:,-config.k_latent:]
    theta_dev_latent = theta_dev[:,:,-config.k_latent:]
    if args.evaluate_test:
        theta_test_latent = theta_test[:,:,-config.k_latent:]
    ## Get Ground Truth Labels
    y_train = np.array(
        [j for i, j in enumerate(ys_train) if i not in train_missing_infer.get("source")] + \
        [j for i, j in enumerate(yt_train) if i not in train_missing_infer.get("target")]
    )
    y_dev = np.array(
        [j for i, j in enumerate(ys_dev) if i not in dev_missing.get("source")] + \
        [j for i, j in enumerate(yt_dev) if i not in dev_missing.get("target")]
    )
    if args.evaluate_test:
        y_test = np.array(
        [j for i, j in enumerate(ys_test) if i not in test_missing.get("source")] + \
        [j for i, j in enumerate(yt_test) if i not in test_missing.get("target")]
    )
    ## Domain Masks
    source_train_ind = list(range(Xs_train.shape[0] - len(train_missing_infer.get("source"))))
    target_train_ind = list(range(len(source_train_ind), y_train.shape[0]))
    source_dev_ind = list(range(Xs_dev.shape[0] - len(dev_missing.get("source"))))
    target_dev_ind = list(range(len(source_dev_ind), y_dev.shape[0]))
    if args.evaluate_test:
        source_test_ind = list(range(Xs_test.shape[0] - len(test_missing.get("source"))))
        target_test_ind = list(range(len(source_test_ind), y_test.shape[0]))
    ## Separate Training Labels
    y_train_s = y_train[source_train_ind]
    y_train_t = y_train[target_train_ind]
    y_dev_s = y_dev[source_dev_ind]
    y_dev_t = y_dev[target_dev_ind]
    if args.evaluate_test:
        y_test_s = y_test[source_test_ind]
        y_test_t = y_test[target_test_ind]
    ## Caching
    if args.cache_predictions:
        ## Labels
        _ = np.save(f"{basedir}/classification/labels.train.npy", y_train)
        _ = np.save(f"{basedir}/classification/labels.dev.npy", y_dev)
        ## Indices
        _ = np.save(f"{basedir}/classification/source.train.npy", source_train_ind)
        _ = np.save(f"{basedir}/classification/target.train.npy", target_train_ind)
        _ = np.save(f"{basedir}/classification/source.dev.npy", source_dev_ind)
        _ = np.save(f"{basedir}/classification/target.dev.npy", target_dev_ind)
        if args.evaluate_test:
            ## Labels
            _ = np.save(f"{basedir}/classification/labels.test.npy", y_test)
            ## Indices
            _ = np.save(f"{basedir}/classification/source.test.npy", source_test_ind)
            _ = np.save(f"{basedir}/classification/target.test.npy", target_test_ind)
    ## Cycle Through Types of Preprocessing, Training, and Inference
    all_scores = []
    for C in config.C:
        for average_representation in config.averaging:
            for norm in config.norm:
                LOGGER.info("Feature Set: Average Representation ({}), Norm ({}), Regularization ({})".format(average_representation, norm, C))
                if average_representation:
                    ## Average
                    X_train = theta_train_latent.mean(axis=0)
                    X_dev = theta_dev_latent.mean(axis=0)
                    if args.evaluate_test:
                        X_test = theta_test_latent.mean(axis=0)
                    ## Normalization (If Desired)
                    if norm:
                        X_train = normalize(X_train, norm=norm, axis=1)
                        X_dev = normalize(X_dev, norm=norm, axis=1)
                        if args.evaluate_test:
                            X_test = normalize(X_test, norm=norm, axis=1)
                    ## Reshape Data
                    X_train = X_train.reshape((1,X_train.shape[0],X_train.shape[1]))
                    X_dev = X_dev.reshape((1,X_dev.shape[0], X_dev.shape[1]))
                    if args.evaluate_test:
                        X_test =  X_test.reshape((1,X_test.shape[0], X_test.shape[1]))
                else:
                    ## Remove Burn In
                    X_train = theta_train_latent.copy()
                    X_dev = theta_dev_latent.copy()
                    if args.evaluate_test:
                        X_test = theta_test_latent.copy()
                    ## Normalization (If Desired)
                    if norm:
                        X_train = np.stack([normalize(x, norm=norm, axis=1) for x in X_train])
                        X_dev = np.stack([normalize(x, norm=norm, axis=1) for x in X_dev])
                        if args.evaluate_test:
                            X_test = np.stack([normalize(x, norm=norm, axis=1) for x in X_test])
                ## Training
                models = []
                for x in tqdm(X_train, desc="Fitting Models", file=sys.stdout):
                    ## Fit Classifier
                    logit = LogisticRegression(C=C,
                                              random_state=42,
                                              max_iter=config.max_iter,
                                              solver='lbfgs')
                    logit.fit(x[source_train_ind], y_train[source_train_ind])
                    ## Get Predictions
                    models.append(logit)
                ## Inference
                y_pred_train = np.zeros((len(models), X_train.shape[0], y_train.shape[0]))
                y_pred_dev = np.zeros((len(models), X_dev.shape[0], y_dev.shape[0]))
                if args.evaluate_test:
                    y_pred_test = np.zeros((len(models), X_test.shape[0], y_test.shape[0]))
                for m, mdl in tqdm(enumerate(models), position=0, desc="Making Predictions", total=len(models), file=sys.stdout):
                    y_pred_train[m] = mdl.predict_proba(X_train[m])[:,1]
                    y_pred_dev[m] = mdl.predict_proba(X_dev[m])[:,1]
                    if args.evaluate_test:
                        y_pred_test[m] = mdl.predict_proba(X_test[m])[:,1]
                ## Cache Predictions
                if args.cache_predictions:
                    ## Predictions
                    _ = np.save(f"{basedir}/classification/predictions.train.{C}.{average_representation}.{norm}.npy", y_pred_train)
                    _ = np.save(f"{basedir}/classification/predictions.dev.{C}.{average_representation}.{norm}.npy", y_pred_dev)
                    if args.evaluate_test:
                        _ = np.save(f"{basedir}/classification/predictions.test.{C}.{average_representation}.{norm}.npy", y_pred_test)
                ## Learn Optimal Thresholds (Youden's J-Score)
                thresholds = {}
                for m, mdl_pred in tqdm(enumerate(y_pred_dev), total=y_pred_dev.shape[0], desc="Learning Binarization Thresholds", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_dev_ind, target_dev_ind]):
                            if args.learn_threshold:
                                d_l_pred = latent_pred[dind]
                                d_l_true = y_dev[dind]
                                if len(d_l_pred) == 0:
                                    continue
                                fpr, tpr, t = metrics.roc_curve(d_l_true, d_l_pred, drop_intermediate=False)
                                j_scores = tpr - fpr
                                j_ordered = sorted(zip(j_scores, t))
                                j_opt_thresh = j_ordered[-1][1]
                                thresholds[(m,l,d)] = j_opt_thresh
                            else:
                                thresholds[(m,l,d)] = 0.5
                ## ROC Curves
                LOGGER.info("Plotting ROC/AUC and Scoring Training/Development Predictions")
                auc_scores = [[[],[]],[[],[]]]
                fig, ax = plt.subplots(2, 2, figsize=(10,5.8), sharex=True, sharey=True)
                for m, mdl_pred in tqdm(enumerate(y_pred_train), total=y_pred_train.shape[0], desc="Train Scoring", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_train_ind, target_train_ind]):
                            d_l_pred = latent_pred[dind]
                            d_l_true = y_train[dind]
                            if len(d_l_pred) == 0:
                                continue
                            tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                            auc_scores[d][0].append(dl_scores.get("auc",0))
                            dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"train","threshold":thresholds[(m,l,d)]})
                            dl_scores.update({"norm":norm, "is_average_representation":average_representation, "C":C})
                            all_scores.append(dl_scores)
                            ax[d][0].plot(tpr, fpr, alpha=0.01 if not average_representation else .8, color=f"navy", linewidth=0.5 if not average_representation else 1)
                for m, mdl_pred in tqdm(enumerate(y_pred_dev), total=y_pred_dev.shape[0], desc="Development Scoring", file=sys.stdout):
                    for l, latent_pred in enumerate(mdl_pred):
                        for d, dind in enumerate([source_dev_ind, target_dev_ind]):
                            d_l_pred = latent_pred[dind]
                            d_l_true = y_dev[dind]
                            if len(d_l_pred) == 0:
                                continue
                            tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                            dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"development","threshold":thresholds[(m,l,d)]})
                            dl_scores.update({"norm":norm, "is_average_representation":average_representation,"C":C})
                            all_scores.append(dl_scores)
                            auc_scores[d][1].append(dl_scores.get("auc",0))
                            ax[d][1].plot(tpr, fpr, alpha=0.01 if not average_representation else .8, color=f"navy", linewidth=0.5 if not average_representation else 1)
                if args.evaluate_test:
                    for m, mdl_pred in tqdm(enumerate(y_pred_test), total=y_pred_test.shape[0], desc="Test Scoring", file=sys.stdout):
                        for l, latent_pred in enumerate(mdl_pred):
                            for d, dind in enumerate([source_test_ind, target_test_ind]):
                                d_l_pred = latent_pred[dind]
                                d_l_true = y_test[dind]
                                if len(d_l_pred) == 0:
                                    continue
                                tpr, fpr, dl_scores = get_scores(d_l_true, d_l_pred, threshold=thresholds[(m,l,d)])
                                dl_scores.update({"model_n":m,"domain":"source" if d == 0 else "target","group":"test","threshold":thresholds[(m,l,d)]})
                                dl_scores.update({"norm":norm, "is_average_representation":average_representation,"C":C})
                                all_scores.append(dl_scores)
                for i, domain in enumerate(["Source","Target"]):
                    ax[-1,i].set_xlabel("True Positive Rate", fontweight="bold")
                    ax[i, 0].set_ylabel("False Positive Rate", fontweight="bold")
                    for j, group in enumerate(["Train","Development"]):
                        ax[i,j].plot([0,1],[0,1],color="black",linestyle="--")
                        ax[i,j].spines["top"].set_visible(False)
                        ax[i,j].spines["right"].set_visible(False)
                        ax[i,j].set_title(f"{domain} {group}", fontweight="bold")
                        if len(auc_scores[i][j]) > 0:
                            ax[i,j].plot([],[],color="navy",label="Mean AUC: {:.3f}".format(np.mean(auc_scores[i][j])))
                            ax[i,j].legend(loc="lower right")
                        ax[i,j].set_xlim(0,1)
                        ax[i,j].set_ylim(0,1)
                fig.tight_layout()
                fig.savefig(f"{basedir}/classification/roc_auc_{average_representation}_{norm}_{C}{args.plot_fmt}",dpi=300)
                plt.close(fig)
    ## Format Scores
    LOGGER.info("Caching Scores")
    all_scores_df = pd.DataFrame(all_scores)
    if args.fold is not None:
        all_scores_df["fold"] = args.fold
    all_scores_df.to_csv(f"{basedir}/classification/scores.csv",index=False)
    ## Script Complete
    LOGGER.info("Done!")
Example #23
    '''
    extracted_words.append(word)

cohesion_score = {
    word: score.cohesion_forward
    for word, score in words.items()
}
tokenizer = MaxScoreTokenizer(scores=cohesion_score)

#================= LDA train start ========================
# Generate the LDAModel:
#   k      = number of topics
#   alpha  = Dirichlet prior on the document-topic distribution
#   eta    = Dirichlet prior on the topic-word distribution
#   min_cf = minimum collection frequency for a word to be kept
model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5)

for i in raw_chat:
    model.add_doc(tokenizer.tokenize(i))

# initialize the model with 0 iterations, then check corpus statistics
model.train(0)
print('Total docs:', len(model.docs))
print('Total words:', model.num_words)
print('Vocab size:', model.num_vocabs)

# train for 200 iterations, logging log-likelihood per word
for i in range(200):
    print('Iteration {}\tLL per word: {}'.format(i, model.ll_per_word))
    model.train(1)
Example #24
vocab = [vocab[v] for v in vocab_mask]

## Generate Corpus
corpus, missing = generate_corpus(X_source,
                                  X_target,
                                  vocab,
                                  source=True,
                                  target=True)

## Initialize LDA Model
n_iter = 1000
n_burn = 250
model = tp.LDAModel(alpha=0.01,
                    eta=0.01,
                    k=50,
                    min_df=min_user_freq,
                    rm_top=250,
                    corpus=corpus,
                    seed=42)

## Initialize Sampler
model.train(1, workers=8)

## Corpus Parameters
V = model.num_vocabs
N = len(model.docs)
K = model.k

## Gibbs Cache
ll = np.zeros(n_iter)
phi = np.zeros((n_iter, K, V))
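
The snippet is cut off right after allocating the Gibbs cache; below is a sketch of the sampling loop that would fill it, mirroring the pattern used in Example #22.

# Sketch of the implied sampling loop (mirrors Example #22).
for epoch in range(n_iter):
    model.train(1, workers=8)
    ll[epoch] = model.ll_per_word
    phi[epoch] = np.vstack([model.get_topic_word_dist(i) for i in range(K)])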
Example #25
    def train_model(self, dataset, hyperparameters=None, top_words=10):
        if hyperparameters is None:
            hyperparameters = dict()
        self.set_default_hyperparameters(hyperparameters)
        if self.use_partitions:
            x_train, x_test = dataset.get_partitioned_corpus(
                use_validation=False)
        else:
            x_train = dataset.get_corpus()
            x_test = None

        lda = tp.LDAModel(k=self.hyperparameters['num_topics'],
                          alpha=self.hyperparameters['alpha'],
                          eta=self.hyperparameters['eta'])

        for i in x_train:
            lda.add_doc(i)

        lda.train(self.hyperparameters['max_iters'])

        topic_word_matrix = np.stack([
            lda.get_topic_word_dist(k, normalize=True) for k in range(lda.k)
        ])  # topic word distribution matrix
        topic_document_matrix = np.stack([
            doc.get_topic_dist() for doc in lda.docs
        ])  # topic document distribution matrix

        additional_words = [
            item for item in dataset.get_vocabulary()
            if item not in list(lda.used_vocabs)
        ]
        num_additional_words = len(additional_words)
        if num_additional_words > 0:
            topic_word_matrix = np.concatenate(
                (topic_word_matrix,
                 np.zeros((topic_word_matrix.shape[0], num_additional_words),
                          dtype=float)),
                axis=1)
        final_vocab = list(lda.used_vocabs) + additional_words
        vocab2id = {w: i for i, w in enumerate(final_vocab)}

        sorted_indexes = [vocab2id[w] for w in dataset.get_vocabulary()]
        topic_word_matrix = topic_word_matrix[:, sorted_indexes]

        # topics extraction
        topic_w = []
        for k in range(lda.k):
            topics = []
            for word in lda.get_topic_words(k):
                topics.append(word[0])
            topic_w.append(topics)

        # Output model on the Train Set
        info = {}
        info['topics'] = topic_w
        info['topic-word-matrix'] = topic_word_matrix
        info['topic-document-matrix'] = topic_document_matrix.T

        # Inference on the test set
        if x_test is not None:
            doc_inst = [lda.make_doc(i) for i in x_test]
            topic_dist, _ = lda.infer(doc_inst)  # topic document distribution
            info['test-topic-document-matrix'] = np.asarray(topic_dist).T

        return info
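
For context, a hedged usage sketch: TomotopyLDA is a hypothetical name for the class this method belongs to, and dataset stands for any object exposing get_corpus(), get_vocabulary() and get_partitioned_corpus().

# Hypothetical usage; the names here are assumptions, not from the original
model = TomotopyLDA()
output = model.train_model(dataset, hyperparameters={'num_topics': 20})
print(output['topics'][:3])                   # top words of the first 3 topics
print(output['topic-word-matrix'].shape)      # (num_topics, vocabulary size)
print(output['topic-document-matrix'].shape)  # (num_topics, num documents)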
Example #26
def lda_param_checker(tw=tp.TermWeight.IDF,
                      min_cf_0=0,
                      min_cf_f=1,
                      min_cf_s=1,
                      min_df_0=0,
                      min_df_f=1,
                      min_df_s=1,
                      rm_top_0=0,
                      rm_top_f=1,
                      rm_top_s=1,
                      k_0=2,
                      k_f=12,
                      k_s=3,
                      alpha_0=-1,
                      alpha_f=0,
                      alpha_s=1,
                      eta_0=0,
                      eta_f=1,
                      eta_s=1,
                      seed=101,
                      corpus=None,
                      burn=100,
                      train=1001,
                      word_list=None,
                      card_count=30,
                      to_excel=False,
                      fname='param_checking.xlsx'):
    """
    Method to automatically iterate through different LDA parameters to compare results
    Parameters
        tw: Union[int, TermWeight]
            term weighting scheme in https://bab2min.github.io/tomotopy/v0.8.0/en/#tomotopy.TermWeight ;
            I chose the default to be inverse document frequency, which means that cards that appear in
            almost all decks are weighted lower than cards that appear in very few decks.
        min_cf_0: int
            Starting minimum card collection frequency
        min_cf_f: int
            Ending minimum card collection frequency
        min_cf_s: int
            Minimum card collection frequency step size
        min_df_0: int
            Starting minimum deck collection frequency
        min_df_f: int
            Ending minimum deck collection frequency
        min_df_s: int
            Minimum deck collection frequency step size
        rm_top_0: int
            Starting number of top cards to exclude
        rm_top_f: int
            Ending number of top cards to exclude
        rm_top_s: int
            Top cards to exclude step size
        k_0: int
            Starting number of topics
        k_f: int
            Ending number of topics
        k_s: int
            Number of topics to increase by per iteration
        alpha_0: int
            Starting number for the alpha hyperparameter as a power of ten, i.e. alpha = 10^(alpha_0)
        alpha_f: int
            Ending number for the alpha hyperparameter as a power of ten, i.e. alpha = 10^(alpha_f)
        alpha_s: int
            Step size for the powers of ten of the alpha hyperparameter
            (e.g. alpha_0=-2, alpha_f=1, alpha_s=1 sweeps alpha over 0.01, 0.1, 1)
        eta_0: int
            Starting number for the eta hyperparameter as a power of ten, i.e. eta = 10^(eta_0)
        eta_f: int
            Ending number for the eta hyperparameter as a power of ten, i.e. eta = 10^(eta_f)
        eta_s: int
            Step size for the powers of ten of the eta hyperparameter
        seed: int
            Random seed, 101 by default for reproducibility; note that exact
            reproduction may still vary with thread count and library version.
        corpus: tomotopy Corpus
            A list of documents to be added into the model. Method will not function without corpus.
        burn: int
            Number of burn-in iterations: initial sampling iterations whose
            results are discarded before statistics are collected
        train: int
            Number of iterations to train over
        word_list: list of lists of strings
            Collection of decklists with each card name represented as a string.
        card_count: int
            Number of cards used to evaluate card coherence.
        to_excel: boolean
            Output the resulting DataFrame to Excel spreadsheet?
        fname: string ending in '.xlsx'
            If to_excel == True, filename of the resulting Excel spreadsheet.
    :return:
        DataFrame that lists the results of the preceding iterations. Contains the following columns:
            k - number of topics
            Avg. LL - average per-word log-likelihood across training checkpoints
                (higher, i.e. closer to zero, indicates a better fit)
            LL Std. Dev. - log-likelihood standard deviation
            LL CV - log-likelihood coefficient of variation (Std. Dev./Average)
            Perplexity - perplexity of the model (lower is better)
            Coherence - (C_V) coherence of the model; higher is better, and values
                roughly in the 0.55-0.75 range are commonly treated as good
    """

    results_lists = [[
        'tw', 'Min. f_col', 'Min. f_doc', 'Top n Terms Removed', 'k', 'alpha',
        'eta', 'Avg. LL', 'LL Std. Dev.', 'LL CV', 'Perplexity'
    ]]
    average_coherences = []
    coh_std_dev = []
    coh_cv = []
    for cf in range(min_cf_0, min_cf_f, min_cf_s):
        print("Collection Frequency = " + str(cf))
        for df in range(min_df_0, min_df_f, min_df_s):
            print("Document Frequency = " + str(df))
            for rm in range(rm_top_0, rm_top_f, rm_top_s):
                print("Remove Top " + str(rm) + " Words")
                for k in range(k_0, k_f, k_s):
                    print(str(k) + " Topics")
                    for a in range(alpha_0, alpha_f, alpha_s):
                        print("alpha = " + str(10**a))
                        for e in range(eta_0, eta_f, eta_s):
                            print("eta = " + str(10**e))
                            ll_list = []
                            lda = tp.LDAModel(tw=tw,
                                              min_cf=cf,
                                              min_df=df,
                                              rm_top=rm,
                                              k=k,
                                              alpha=10**a,
                                              eta=10**e,
                                              seed=seed,
                                              corpus=corpus)
                            lda.burn_in = burn
                            lda.train(0)
                            for i in range(0, train, 100):
                                lda.train(100)
                                ll_list.append(lda.ll_per_word)
                            lda_mean = sum(ll_list) / len(ll_list)
                            lda_variance = sum([
                                ((x - lda_mean)**2) for x in ll_list
                            ]) / len(ll_list)
                            lda_std_dev = lda_variance**0.5
                            lda_cv = lda_std_dev / lda_mean
                            # get_lda_topics was designed for HDP models, but it
                            # should work for LDA as well
                            lda_topics = get_lda_topics(lda, card_count)
                            results_list = [
                                str(tw), cf, df, rm, k, 10**a, 10**e, lda_mean,
                                lda_std_dev, lda_cv, lda.perplexity
                            ]
                            topic_coherences = eval_coherence_by_topic(
                                lda, deck_lists=word_list)
                            results_list.extend(topic_coherences)
                            average_coh = eval_coherence(lda_topics, word_list)
                            average_coherences.append(average_coh)
                            coh_variance = sum([((x - average_coh)**2)
                                                for x in topic_coherences
                                                ]) / len(topic_coherences)
                            coh_std_dev.append(coh_variance**0.5)
                            coh_cv.append((coh_variance**0.5) / average_coh)
                            results_lists.append(results_list)
    # Per-topic coherence column headers; this uses the last model's k, so the
    # DataFrame only lines up when every run used the same number of topics
    for num_top in range(0, lda.k):
        results_lists[0].append('Topic ' + str(num_top) + ' Coherence')
    df = pd.DataFrame(data=results_lists[1:], columns=results_lists[0])
    df['Average Coherence'] = average_coherences
    df['Coherence Std Dev'] = coh_std_dev
    df['Coherence CV'] = coh_cv
    if to_excel:
        df.to_excel(fname)  # recent pandas versions no longer accept encoding= here
    return df
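
A hedged usage sketch: corpus is assumed to be a tomotopy Corpus and deck_lists the tokenized decklists described in the docstring.

# Sweep k over 2, 5, 8, 11 with every other parameter left at its default,
# and write the resulting grid to an Excel spreadsheet.
results = lda_param_checker(corpus=corpus,
                            word_list=deck_lists,
                            k_0=2, k_f=12, k_s=3,
                            to_excel=True,
                            fname='param_checking.xlsx')
print(results[['k', 'Avg. LL', 'Perplexity']].head())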
Example #27
import tomotopy as tp

model = tp.LDAModel(k=2, seed=42)
model.add_doc(["this", "is", "a", "test"])
model.add_doc(["another", "document"])
model.add_doc(["a", "new", "document"])
model.train(100)
print(model.docs[0].get_topics())
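
A natural extension, not in the original: scoring an unseen document against the trained model via make_doc and infer.

# Sketch: infer the topic distribution of a document the model never saw
unseen = [model.make_doc(["yet", "another", "test"])]
topic_dists, ll = model.infer(unseen)
print(topic_dists[0])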
Example #28

# `choice` is presumably numpy.random.choice; ai_ids, references_ids,
# pre_2021_papers and tok are built earlier in the full notebook.
from numpy.random import choice

sample_ai_ids = set(choice(list(ai_ids), 30000, replace=False))
reference_exc = set(references_ids) - sample_ai_ids
citation_exc = set(pre_2021_papers) - sample_ai_ids

selected_ids = sample_ai_ids | reference_exc | citation_exc

# Removing empty documents
selected_corpus = {
    k: v
    for k, v in tok.items() if k in selected_ids and len(v) > 0
}

# Train a topic model and extract the topic mix

mdl = tp.LDAModel(k=300)
for doc in selected_corpus.values():
    mdl.add_doc(doc)

for i in range(0, 150, 10):
    mdl.train(10)
    print('Iteration: {}\tLog-likelihood: {}'.format(i, mdl.ll_per_word))

# for k in range(mdl.k):
#     print('Top 10 words of topic #{}'.format(k))
#     print(mdl.get_topic_words(k, top_n=10))

mdl.summary()
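
The cell above promises to "extract the topic mix"; here is a hedged sketch of one way to do that, assuming mdl.docs preserves the order in which documents were added.

import numpy as np

# Per-document topic distribution, keyed by the original corpus id
topic_mix = {
    key: np.asarray(doc.get_topic_dist())
    for key, doc in zip(selected_corpus.keys(), mdl.docs)
}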

# Calculate topic distributions by category and compare with AI: are they significantly different?
# We could check this by comparing the means of the log of