Example #1
import os

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet


def load_lda_model(lda_model_name=None, mallet=False):
    """Load a previously saved LDA model (Mallet or gensim) from disk."""
    # Guard against a missing name: os.path.isfile(None) raises TypeError.
    if lda_model_name and os.path.isfile(lda_model_name):
        if mallet:
            return LdaMallet.load(lda_model_name)
        return LdaModel.load(lda_model_name)
    return None
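A minimal usage sketch for the loader above; the file name "lda_model_tweets_10" is a hypothetical stand-in, not part of the original code.

# Hypothetical usage: try to load a previously saved gensim LDA model;
# load_lda_model returns None if the file does not exist.
lda = load_lda_model("lda_model_tweets_10", mallet=False)
if lda is not None:
    for topic in lda.show_topics(num_topics=5):
        print(topic)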
Example #2
import os

from gensim.models import LdaModel
from gensim.models.wrappers import LdaMallet


def build_lda(text_corpus=None, dictionary=None, n_topics=10, mallet=True, dataname="none"):
    """
    Given a corpus, builds an LDA model (Mallet or gensim) and saves it.

    :param text_corpus: training corpus in bag-of-words format, as produced by Dictionary.doc2bow
                        (gensim's LDA trainers require BOW input, despite the parameter name)
    :param dictionary: dictionary mapping tokens to ids
    :param n_topics: number of topics
    :param mallet: use Mallet LDA if True, gensim LDA otherwise
    :param dataname: basename of the LDA model
    :return: the file name under which the LDA model was saved
    """

    if mallet:
        # The Mallet wrapper needs the path to the mallet binary, taken from $MALLETPATH
        mallet_path = os.environ.get("MALLETPATH")
        lda_model = LdaMallet(mallet_path, corpus=text_corpus, num_topics=n_topics, id2word=dictionary, workers=4,
                              optimize_interval=10, iterations=1000, prefix=os.path.join(os.getcwd(), 'mallet/'))
    else:
        lda_model = LdaModel(text_corpus, id2word=dictionary, num_topics=n_topics, distributed=False,
                             chunksize=2000, passes=5, update_every=10, alpha='asymmetric',
                             eta=0.1, decay=0.5, eval_every=10, iterations=1000, gamma_threshold=0.001)

    # Save under a name derived from the dataset name, topic count, and backend
    lda_model_name = make_lda_model_name(dataname, n_topics=n_topics, mallet=mallet)
    lda_model.save(lda_model_name)
    return lda_model_name
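The helper make_lda_model_name is not shown on this page; a plausible sketch, inferred from the file-name patterns used in Example #3, might look like the following (the exact implementation is an assumption).

def make_lda_model_name(dataname, n_topics=10, mallet=True):
    # Assumed helper: mirrors the naming pattern from Example #3.
    if mallet:
        return "lda_model_mallet_%s_%i" % (dataname, n_topics)
    return "lda_model_%s_%i" % (dataname, n_topics)

# Hypothetical call, assuming bow_corpus and dictionary were built with gensim beforehand:
# model_name = build_lda(text_corpus=bow_corpus, dictionary=dictionary, n_topics=20,
#                        mallet=False, dataname="tweets")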
Example #3
import os
from operator import itemgetter

import dateutil.parser
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel, Phrases
from gensim.models.wrappers import LdaMallet

# normalize_words and get_date are project helpers assumed to be defined elsewhere.


def calculate_lda(dataset_raw, n_topics=10, lda_model_name="",
                  mallet=True, mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet",
                  dataname="none"):

    with open(dataname + "_log.txt", 'a') as fout:

        if dataset_raw.include_date:
            dates = [text[1] for text in dataset_raw]
            dataset = [normalize_words(text[0].split(), dataset_raw.stoplist) for text in dataset_raw]
        else:
            dates = ["" for _ in dataset_raw]
            dataset = dataset_raw

        # Merge frequent bigrams into single tokens before building the dictionary
        bi_grams = Phrases(dataset, threshold=3)
        dataset = bi_grams[dataset]

        dictionary = Dictionary(dataset)
        dictionary.filter_extremes(no_below=1, no_above=0.9)

        bow_corpus = [dictionary.doc2bow(text) for text in dataset]

        fout.write("# Topics: %s\n" % n_topics)

        # Train a new model unless one with this name already exists on disk
        if not os.path.isfile(lda_model_name):

            if mallet:
                lda_model = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=n_topics, id2word=dictionary,
                                      workers=4, optimize_interval=10, iterations=1000)
                lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics)
            else:
                lda_model = LdaModel(bow_corpus, id2word=dictionary, num_topics=n_topics, distributed=False,
                                     chunksize=2000, passes=5, update_every=10, alpha='asymmetric',
                                     eta=0.1, decay=0.5, eval_every=10, iterations=1000, gamma_threshold=0.001)
                lda_model_name = "lda_model_%s_%i" % (dataname, n_topics)

            lda_model.save(lda_model_name)

        else:
            if mallet:
                lda_model = LdaMallet.load(lda_model_name)
            else:
                lda_model = LdaModel.load(lda_model_name)

        topic_definition = []

        # show_topics here is assumed to return a list of (weight, word) pairs
        # per topic (old gensim API), matching the original unpacking order.
        for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20, formatted=False)):
            fout.write("%i \n" % i)
            topic_list = []  # words in the topic
            freq_list = []   # corpus document frequency of each word
            a_list = []      # topic weight of each word
            for weight, word in topic:
                topic_list.append(word)
                freq_list.append(dictionary.dfs[dictionary.token2id[word]])
                a_list.append(weight)

            fout.write("%s\n\n" % repr(sorted(zip(topic_list, freq_list), key=itemgetter(1))))

            topic_definition.append("%i, %s" % (i, " ".join(sorted(topic_list))))

        fout.write("Total number of documents: %i\n" % dictionary.num_docs )



        # Only consider documents after this (hard-coded) start date, in date order
        earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014")

        a = [tup for tup in sorted(zip(bow_corpus, dates), key=get_date)
             if dateutil.parser.parse(tup[1]) > earliest_date]

        print(len(a))
        print(a[-1])
        latest_date = dateutil.parser.parse(a[-1][1])

        num_bins = 100

        time_span = latest_date - earliest_date
        print(time_span)
        time_bin = time_span / num_bins
        print(time_bin)

        bin_lows = [earliest_date]
        bin_high = earliest_date + time_bin
        counts = [[0 for _ in range(n_topics)] for _ in range(num_bins + 1)]
        i = 0
        for text in a:
            topic_assignments = lda_model[text[0]]
            date_str = text[1]
            if date_str is not None:
                cur_date = dateutil.parser.parse(date_str)
                # Advance past empty bins too, not just one bin per document
                while cur_date >= bin_high:
                    i += 1
                    bin_lows.append(bin_high)
                    bin_high = bin_lows[-1] + time_bin
                # counts[i][max(topic_assignments, key=itemgetter(1))[0]] += 1
                for topic_id, weight in topic_assignments:
                    counts[i][topic_id] += weight

        fout.write("Topic weights summed per time bin:\n")
        fout.write("%s\n" % counts)

        a = 1. * np.array(counts)

        np.savetxt("mpeketoni_cnts.txt", a)
        # Use separate handles so the log file handle `fout` is not shadowed
        with open("mpeketoni_bins.txt", 'w') as fbins:
            for date in bin_lows:
                fbins.write("%s\n" % date)
        with open("mpeketoni_labels.txt", 'w') as flabels:
            for label in topic_definition:
                flabels.write("%s\n" % label)

        return a, bin_lows, topic_definition
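A hedged usage sketch for the pipeline above. dataset_raw is a project-specific corpus object (it must be iterable and expose include_date and stoplist), so the call below only illustrates the interface; the dataname "mpeketoni" matches the hard-coded output file names.

# Hypothetical call: returns per-bin topic weights, bin boundaries, and topic labels.
# counts, bin_lows, labels = calculate_lda(dataset_raw, n_topics=10,
#                                          mallet=False, dataname="mpeketoni")
# counts:   numpy array of shape (num_bins + 1, n_topics), summed topic weights per time bin
# bin_lows: datetime lower bound of each time bin
# labels:   "topic_id, sorted topic words" strings, one per topic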