Example #1
def make_lda_model():
    tfidf_model = TfidfModel.load((output_dir / 'tfidf_model.pkl').as_posix())
    lda_model = LdaModel(
        nmf_iterator(CONTENT_FILES,
                     Dict.load((output_dir / 'dict.pkl').as_posix()),
                     tfidf_model),
        num_topics=TOPIC_NUM)
    lda_model.save((output_dir / 'lda_model.pkl').as_posix())
Example #2
def get_lda_model(doc_term_matrix, id2word, fname):
    try:
        lda_model = LdaModel.load(fname)
    except Exception:  # no saved model on disk; train a new one
        lda_model = LdaModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            num_topics=params['num_topics'],
            chunksize=params['chunksize'],
            random_state=100,
            update_every=1,  # online iterative learning
            passes=2,
            distributed=False,
            # alpha='auto',
            per_word_topics=True)

    _save_model(lda_model, fname=fname)

    return lda_model
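How the inputs are built is not shown here; a minimal usage sketch, assuming gensim's Dictionary and a placeholder path (the module-level params dict and the _save_model helper come from the surrounding project):

from gensim.corpora import Dictionary

# hypothetical toy corpus: a list of token lists
docs = [['topic', 'models', 'cluster', 'words'],
        ['gensim', 'implements', 'online', 'lda']]
id2word = Dictionary(docs)
doc_term_matrix = [id2word.doc2bow(doc) for doc in docs]

# assumes `params` (num_topics, chunksize, ...) is defined as in the source project
lda_model = get_lda_model(doc_term_matrix, id2word, fname='lda.model')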
Example #3
def _load_model(type, fname='../../model/'):
    try:
        if type == 'lsi':
            return LsiModel.load(fname)
        elif type == 'lda':
            return LdaModel.load(fname)
        elif type == 'mallet':
            return LdaMallet.load(fname)
    except Exception:
        return None
Example #4
def topic_model_visualize(textlist: list, num_topics: int) -> None:
    """Визуализация тематической модели"""
    textlist = [textlist]
    common_dictionary = Dictionary(textlist)
    common_corpus = [common_dictionary.doc2bow(text) for text in textlist]
    lda = LdaModel(common_corpus, num_topics=num_topics)

    vis = pyLDAvis.gensim.prepare(lda, common_corpus, common_dictionary)
    pyLDAvis.save_html(vis, 'LDA.html')
    pyLDAvis.show(data=vis, open_browser=True)
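Note that in newer pyLDAvis releases the gensim bindings were renamed, so pyLDAvis.gensim becomes pyLDAvis.gensim_models. A minimal sketch of the equivalent call on a current install, assuming the same lda, common_corpus, and common_dictionary as above:

import pyLDAvis
import pyLDAvis.gensim_models  # replaces pyLDAvis.gensim in newer releases

vis = pyLDAvis.gensim_models.prepare(lda, common_corpus, common_dictionary)
pyLDAvis.save_html(vis, 'LDA.html')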
Example #5
def get_lda_model(doc_term_matrix, id2word, fname, num_topics=None):

    if params['training']:
        n_topics = params['num_topics'] if num_topics is None else num_topics
        lda_model = LdaModel(corpus=doc_term_matrix,
                             id2word=id2word,
                             num_topics=n_topics,
                             passes=5,
                             per_word_topics=True)
        _save_model('lda', lda_model, fname=fname)
    else:
        lda_model = _load_model('lda', fname)

    return lda_model
Example #6
def _load_model(model_type, fname):
    logger.info(f'Loading {model_type} model from {fname}...')
    try:
        if model_type == 'lsi':
            return LsiModel.load(f'../model/lsi_model/{fname}')
        elif model_type == 'lda':
            return LdaModel.load(f'../model/lda_model/{fname}')
        elif model_type == 'mallet':
            return LdaMallet.load(f'../model/mallet_model/{fname}')
        elif model_type == 'hdp':
            return HdpModel.load(f'../model/hdp_model/{fname}')
    except Exception as ex:
        logger.warning(f'{model_type} type of {fname} could not be loaded.',
                       exc_info=ex)
        return None
Example #7
def get_lda_model_byDomains(domains):
    """ Создать LDA модель из заданных ссылок
    :param domains: имена сообществ VK
    """

    common_texts = normilize_texts(domains[0])

    for i in range(1, len(domains)):
        common_texts += normilize_texts(domains[i])

    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=len(domains))

    return lda
Example #8
def process_content(filename, chunk_size=CHUNK_SIZE):
    model = LdaModel.load((output_dir / 'lda_model.pkl').as_posix())
    temp_dict = Dict.load((output_dir / 'dict.pkl').as_posix())
    out_file = output_dir / (filename.stem + '_lda.csv')
    # create the output file if it does not exist yet
    if not out_file.is_file():
        out_file.touch()

    # write the CSV header only once, not for every appended chunk
    write_header = out_file.stat().st_size == 0
    for chunk in pd.read_csv(filename,
                             usecols=['id', 'content'],
                             chunksize=chunk_size):
        # naive whitespace tokenization, then map each document
        # to its LDA topic distribution
        chunk['content'] = chunk['content'].str.lower().str.split() \
            .apply(lambda doc: model[temp_dict.doc2bow(doc)])

        chunk.to_csv(out_file, mode='a', header=write_header)
        write_header = False
Example #9
def build(self, _label):
    modelfile = "./models/{0}.model".format(_label)
    dictfile = "./models/{0}.dict".format(_label)
    corpusfile = "./models/{0}.mm".format(_label)
    if os.path.isfile(modelfile):
        dictionary = corpora.Dictionary.load(dictfile)
        corpus = corpora.MmCorpus(corpusfile)
        ldamodel = LdaModel.load(modelfile)
    else:
        texts = []
        for tweetid, label in self.labels.items():
            if label == _label:
                texts.extend(self.getTweetTexts(tweetid))
        dictionary, corpus, ldamodel = self.buildModel(texts)
        dictionary.save(dictfile)
        corpora.MmCorpus.serialize(corpusfile, corpus)
        ldamodel.save(modelfile)
    return dictionary, corpus, ldamodel
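buildModel is defined elsewhere in the source project. A minimal hypothetical sketch of what it plausibly does, inferred only from how its three return values are used above (the real implementation may differ; the num_topics default is an assumption):

def buildModel(self, texts, num_topics=10):
    # hypothetical reconstruction: build a dictionary, a bag-of-words
    # corpus, and an LDA model from the collected tweet texts
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    return dictionary, corpus, ldamodel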
Example #10
def get_lda_model2(doc_term_matrix, id2word, fname, num_topics=None):

    if params['training']:
        n_topics = params['num_topics'] if num_topics is None else num_topics
        lda_model = LdaModel(
            corpus=doc_term_matrix,
            id2word=id2word,
            num_topics=n_topics,
            chunksize=3000,
            passes=20,
            alpha='auto',
            # eta='auto',
            iterations=100,
            per_word_topics=True)
        _save_model('lda', lda_model, fname=fname)
    else:
        lda_model = _load_model('lda', fname)

    return lda_model
Example #11
def LDA(textlist: list, num_topics: int):
    textlist = [textlist]  # wrap the token list as a one-document corpus
    common_dictionary = Dictionary(textlist)
    common_corpus = [common_dictionary.doc2bow(text) for text in textlist]
    lda = LdaModel(common_corpus, num_topics=num_topics)
    return lda
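A quick usage sketch (the sample tokens are placeholders). Because the model is trained without an id2word mapping, print_topics shows integer token IDs rather than words:

# hypothetical usage of the LDA() helper above
tokens = ['topic', 'modeling', 'with', 'gensim', 'and', 'topic', 'models']
lda = LDA(tokens, num_topics=2)

for topic_id, terms in lda.print_topics():
    print(topic_id, terms)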
Example #12
    end_date = args.end_date
    n_topics = args.num_topics
    n_articles = args.top_na
    n_vocabs = args.top_nv

    DIR_NAME = os.path.join('./dirs', f"{category}-{start_date}-{end_date}")
    with open(os.path.join(DIR_NAME, 'corpus'), 'rb') as f:
        corpus = pickle.load(f)
    dictionary = Dictionary(corpus)
    gensim_corpus = [dictionary.doc2bow(doc) for doc in corpus]

    if args.estimate == 'y':
        print("\nEstimating parameters of LDA model")
        start = time.time()
        model = LdaModel(gensim_corpus,
                         id2word=dictionary,
                         num_topics=n_topics)
        model.save(datapath(f"{category}-{start_date}-{end_date}"))
        minute, second = list(map(int, divmod(time.time() - start, 60)))
        print(f">>> Elapsed time : {minute}m {second}s")

    print(
        f"\nSaving DataFrame of top {n_articles} relevant articles per topic and {n_vocabs} vocabularies from each topic"
    )
    model = LdaModel.load(datapath(f"{category}-{start_date}-{end_date}"))
    start = time.time()

    topn_articles = docs_by_topic(model, gensim_corpus, n_articles)
    with open(os.path.join(DIR_NAME, f"topn_articles_{start_date[:6]}"),
              'wb') as f:
        pickle.dump(topn_articles, f)
Example #13
def grid_search_lda_ASM(texts,
                        n_topics_range, iterations, passes,
                        out_dir, verbose=True, save_doc_top=True):
    '''Fit topic models and search for optimal hyperparameters.

    An LDA model is fitted for each number of topics;
    returned are the model, its coherence score, and the
    corresponding _asymmetrical_ priors the model learned (alpha and eta).


    Parameters
    ----------
    texts : list
        preprocessed corpus, where texts[0] is a document
        and texts[0][0] is a token.

    n_topics_range : range of int
        range of integers to use as the number of topics
        in iterations of the topic model.

    iterations : int
        maximum number of iterations for each topic model

    passes : int
        maximum number of passes (restarting the iterations) for each topic model

    out_dir : str
        path to a directory, where results will be saved (in a child directory).

    verbose : bool
        print progress messages?

    save_doc_top : bool
        save document-topic matrices from the models?


    Exports
    -------
    out_dir/report_lines/*
        pickled dict with model information
        (n topics, model coherence, per-topic coherence, hyperparameters)
        
    out_dir/models/*
        gensim objects, where the model is saved.
        
    out_dir/plots/*
        pyLDAvis visualizations of the model
    '''
    # validate out_dir and create the needed subdirectories
    make_folders(out_dir)

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # input texts to gensim format
    dictionary = corpora.Dictionary(texts)
    bows = [dictionary.doc2bow(tl) for tl in texts]

    # iterate
    report_list = []
    for n_top in n_topics_range:

        if verbose:
            print("{} topics".format(n_top))

        start_time = time()

        # paths for saving
        ## it's not very elegant to define the paths here
        ## when the function make_folders already exists
        filename = str(n_top) + "T_" + 'ASM'
        report_path = os.path.join(
            out_dir,
            'report_lines',
            filename + '.ndjson'
        )

        model_path = os.path.join(
            out_dir,
            'models',
            filename + '.model'
        )

        pyldavis_path = os.path.join(
            out_dir,
            'plots',
            filename + '_pyldavis.html'
        )

        doctop_path = os.path.join(
            out_dir,
            'doctop_mats',
            filename + '_mat.ndjson'
        )

        # train model
        # TODO: higher / customizable fine hyperparameters?
        model = LdaModel(
            corpus=bows,
            iterations=iterations,
            ## optimizing hyperparameters
            num_topics=n_top,
            alpha='auto',
            eta='auto',
            ## fine hyperparameters
            decay=0.5,
            offset=1.0,
            eval_every=10,
            gamma_threshold=0.001,
            minimum_probability=0.01,
            minimum_phi_value=0.01,
            ## utility
            random_state=None,
            per_word_topics=False,
            id2word=dictionary,
            passes=passes)

        # track time usage
        training_time = time() - start_time
        if verbose:
            print('    Time: {}'.format(training_time))

        # coherence
        coherence_model = CoherenceModel(
            model=model,
            texts=texts,
            corpus=bows,
            coherence='c_v'
        )

        coh_score = coherence_model.get_coherence()
        coh_topics = coherence_model.get_coherence_per_topic()

        if verbose:
            print('    Coherence: {}'.format(coh_score.round(2)))

        # save priors
        alpha = model.alpha.tolist()
        eta = model.eta.tolist()

        # save report
        report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
        report_list.append(report)
        with open(report_path, 'w') as f:
            ndjson.dump(report, f)

        # save model
        model.save(model_path)

        # produce a visualization
        # keep sort_topics off so pyLDAvis topic indices stay aligned with the model's
        vis = pyLDAvis.gensim.prepare(
            model, bows, dictionary, sort_topics=False
        )

        pyLDAvis.save_html(vis, pyldavis_path)

        # save document-topic matrix
        if save_doc_top:
            # keep minimum_probability at 0 for a complete matrix
            doc_top = [model.get_document_topics(doc, minimum_probability=0)
                       for doc in model[bows]]

            # unnest (n topic, prob) tuples
            # float to convert from np.float32 which is not
            # JSON serializable
            doc_top_prob = [
                [float(prob) for i, prob in doc]
                for doc in doc_top
            ]

            # save the matrix as ndjson
            with open(doctop_path, 'w') as f:
                ndjson.dump(doc_top_prob, f)

    return None
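A hedged invocation sketch for the grid search above; the toy corpus and the output directory are placeholders:

# hypothetical call: `docs` is a list of token lists
docs = [['topic', 'model', 'coherence'], ['alpha', 'eta', 'priors']]

grid_search_lda_ASM(
    texts=docs,
    n_topics_range=range(5, 30, 5),  # fit models with 5, 10, ..., 25 topics
    iterations=100,
    passes=2,
    out_dir='lda_grid',
    verbose=True,
    save_doc_top=True)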
Example #14
def lda_mod(domains):
    common_texts = redact_finish(domains)
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, num_topics=len(domains))
    return lda
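A usage sketch for the helper above (the community names are placeholders; redact_finish is the project's own preprocessing step). The model fits one topic per input community and, lacking an id2word mapping, prints token IDs:

# hypothetical usage: one topic per input community
lda = lda_mod(['community_one', 'community_two'])
for topic_id, terms in lda.print_topics():
    print(topic_id, terms)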