Example #1
File: models.py Project: Kamaros/fnc-1
def lda(dataframe, num_topics=100):
    """Returns an LDA model for documents stored in a DataFrame.

    Precomputed models are read from file if previously cached, or generated then cached otherwise.

    Parameters
    ----------
    dataframe : Pandas DataFrame
        The DataFrame containing the documents to process.
    num_topics : int (default is 100)
        The number of topics to train the LDA model with.

    Returns
    -------
    model : Gensim LdaModel
        LDA model for documents stored in the DataFrame.
    """
    filename = 'caches/models/lda.model'

    if not os.path.isfile(filename):
        dictionary = dictionary_corpus(dataframe)
        bow = bow_corpus(dataframe)
        lda_model = LdaModel(bow,
                             id2word=dictionary,
                             num_topics=num_topics,
                             passes=20)
        lda_model.save(filename)
    else:
        lda_model = LdaModel.load(filename)

    return lda_model
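A minimal usage sketch for the cached helper above, assuming the dictionary_corpus and bow_corpus helpers from the same module are available; the DataFrame column name is only illustrative:

import pandas as pd

# illustrative input; the real project builds this DataFrame from the FNC-1 data
df = pd.DataFrame({'articleBody': ['first document text here', 'second document text here']})

model = lda(df, num_topics=100)  # trained on first call, loaded from caches/models/lda.model afterwards
for topic_id, terms in model.print_topics(num_topics=5, num_words=8):
    print(topic_id, terms)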
Example #2
def train_lda(load_model, corpus, num_topics, dictionary):
    # Train LDA model.
    if load_model and os.path.exists(MODEL_PATH):
        model = LdaModel.load(MODEL_PATH)
    else:
        # Set training parameters.
        chunksize = 2000
        passes = 20  # number of passes (epochs) over the corpus
        iterations = 400
        eval_every = None  # Don't evaluate model perplexity, takes too much time.
        # Make an index-to-word dictionary.
        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         chunksize=chunksize,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)
        model.save(MODEL_PATH)
    return model
Example #3
def lda(clean_docs, model_name, topics):
    # turn all data into a dictionary mapping of normalized words and their integer ids
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LDA model
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15

    ldamodel = LdaModel(corpus,
                        num_topics=num_topics,
                        id2word=dictionary,
                        passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')
    topics = ldamodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
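Since Example #3 serializes the dictionary, corpus, and model, a short sketch of reloading them in a later session may help (file names follow the pattern used above; model_name is a placeholder for the name passed when training):

from gensim import corpora
from gensim.corpora import MmCorpus
from gensim.models import LdaModel

model_name = 'my_model'  # placeholder: the same name passed to lda() when training
dictionary = corpora.Dictionary.load('dictionary_' + model_name + '.gensim')
corpus = MmCorpus('corpus_' + model_name + '.mm')
ldamodel = LdaModel.load('model_' + model_name + '.gensim')

for topic in ldamodel.print_topics(num_words=5):
    print(topic)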
Example #4
class LdaVec(TopicVec):
    def __init__(self, vec_num):
        TopicVec.__init__(self, vec_num)

    def __gen_model(self, corpus):
        # if self.p_corpus == 'onehot':
        #     model_name = 'lda_one_hot.model'
        # else:
        #     model_name = 'lda_tfidf.model'
        model_name = 'lda.model'
        self.model = LdaModel(corpus,
                              id2word=self.dictionary,
                              num_topics=self.vec_num)
        self.model.save(os.path.join(self.out_dir, model_name))

    def __get_model(self):
        model_name = 'lda.model'
        if os.path.exists(os.path.join(self.out_dir, model_name)):
            self.model = LdaModel.load(os.path.join(self.out_dir, model_name))
        else:
            raise FileNotFoundError('"{}" file not found!'.format(model_name))

    def fit(self, doc, out_dir, use_exist_dictionary=False):
        TopicVec.fit(self, doc, out_dir, use_exist_dictionary)
        self.__gen_model(self.corpus)
Example #5
def train_lda(is_tfidf, num_topics):
    # Create corpus
    print('Create corpus')
    corpus = doc_processor.create_corpus(dictionary, doc_list, is_tfidf)

    # Set training parameters.
    num_topics = num_topics
    chunksize = 20000
    # passes = 20
    # iterations = 400
    eval_every = None

    print('Start LDA training')
    start = time.time()
    id2word = dictionary.id2token

    lda_model = LdaModel(
        corpus=corpus,
        # id2word=id2word,
        chunksize=chunksize,
        # alpha='auto',
        # eta='auto',
        num_topics=num_topics,
        # passes=passes,
        # iterations=iterations,
        eval_every=eval_every
    )
    
    ir_method = 'tfidf' if is_tfidf else 'bow'
    lda_model.save('saved_models/lda_model_%s_%s' % (ir_method, num_topics))
    print('LDA for %s %s done in %.1f seconds' % (ir_method, num_topics, time.time() - start))
def train_model():
    dictionary = pickle.load(open('dictionary.pkl', 'rb'))  # pickle files must be opened in binary mode
    train = pickle.load(open('train.pkl', 'rb'))
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    # save / load the model
    lda.save('test_lda.model')
Example #7
File: LDA_model.py Project: NEUljw/WN2WD
def train_model(num_topic):
    train = get_dict()
    dictionary = Dictionary.load('train_data.dict')  # Path
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic)
    # save the model
    lda.save('LDA_trained_model/lda.model')  # Path
def train(corpuspath,modelpath):
    train = []
    # stopwords = codecs.open('stopWords/1893(utf8).txt','r',encoding='utf8').readlines()
    # stopwords = [ w.strip() for w in stopwords ]
    fp = codecs.open(corpuspath, 'r', encoding='utf8')
    for line in fp:
        line = line.strip()
        if line == '':continue
        line = line.split()
        train.append([w for w in line])

    dictionary = corpora.Dictionary(train)
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=10, passes=20)
    lda.save(modelpath)

    topic_words = open('../result/lda/fact_lad_print-10.txt','w',encoding='utf-8')
    print_str = ''
    for topic in lda.print_topics(num_words=100):
        termNumber = topic[0]
        listOfTerms = topic[1].split('+')
        # for term in listOfTerms:
        #     listItems = term.split('*')
        #     # print(listItems)
        #     print('  ', listItems[1], '(', listItems[0], ')', sep='')
        print_str += topic[1] + '\n'
    topic_words.write(print_str)
    topic_words.close()
Example #9
def lda_features(
    sentence_words,
    lexicon,
    model_path,
    num_topics=50,
    mode='train',
):
    dictionary = corpora.Dictionary([[lex] for lex in lexicon])
    corpus = [dictionary.doc2bow(words) for words in sentence_words]
    if mode == 'train' and not os.path.exists(model_path):
        ldamodel = LdaModel(corpus, num_topics=num_topics)
        ldamodel.save(model_path)
    else:
        ldamodel = LdaModel.load(model_path)

    features = []
    for sentence in corpus:
        lda_f = ldamodel[sentence]
        feats = np.zeros((num_topics, ))
        for (n_t, s_t) in lda_f:
            feats[n_t] = s_t
        features.append(feats)

    result = np.asarray(features)
    return result
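A hedged usage sketch for lda_features above; the tokenized sentences, lexicon, and model path are illustrative stand-ins:

sentence_words = [['free', 'offer', 'click'], ['meeting', 'agenda', 'tomorrow']]
lexicon = ['free', 'offer', 'click', 'meeting', 'agenda', 'tomorrow']

features = lda_features(sentence_words, lexicon,
                        model_path='lda_features.model',  # illustrative path
                        num_topics=50, mode='train')
print(features.shape)  # (len(sentence_words), num_topics): one dense topic vector per sentence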
Example #10
def main(num_topics):
    f_path = os.path.join(DATA_PATH, 'interim',
                          'onsite_search_nlp_gensim_dictionary.pkl')
    with open(f_path, 'rb') as f:
        dictionary = pickle.load(f)
    print('Loaded dictionary: {}'.format(dictionary))

    f_path = os.path.join(DATA_PATH, 'interim',
                          'onsite_search_terms_2017_2019_nlp.pkl')
    df_search_terms = pd.read_pickle(f_path)
    print('Loaded search corpus: {} rows'.format(len(df_search_terms)))

    print('Logging to terminal')
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    print('Starting model training...')
    print()
    ldamodel = LdaModel(
        corpus=df_search_terms.corpus.dropna().tolist(),
        num_topics=num_topics,
        id2word=dictionary,
    )

    print()
    print('Done training, saving to file')
    f_path = 'onsite_search_terms_lda_2017_2019_{}_topic.model'.format(
        num_topics)
    ldamodel.save(f_path)
Example #11
def save_model():
    """
    保存LDA模型
    :param model_path:
    :return:
    -----------------
    corpus:[
            [('词ID', 词频),('词ID', 词频)...],
            [('词ID', 词频),('词ID', 词频)...],
            .......
            ] 稀疏向量集
    id2word: {'词1':0, '词2':1. ..}

    """
    train_set = get_train_set()
    word_dict = Dictionary(train_set)  # build the dictionary: each word maps to an integer index
    corpus_list = [word_dict.doc2bow(text)
                   for text in train_set]  # term-frequency counts, converted to sparse vector format
    lda = LdaModel(
        corpus=corpus_list,
        id2word=word_dict,
        num_topics=100,
        # passes=5, # epoch
        alpha='auto')
    lda.print_topic(99)
    # save the LDA model
    lda.save(lda_model_path)
Example #12
def train_model():
    dictionary = get_dict()[0]
    train = get_dict()[1]
    corpus = [dictionary.doc2bow(text) for text in train]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=7)
    # save / load the model
    lda.save('test_lda.model')
def get_lda_model():
    """
    (50,28767)
    获得话题
    :return:
    """
    text_array = list()

    with open("jobs-unigrams-filter") as f:
        for line in tqdm(f):
            line = line.strip().split(" ")
            line.remove(line[0])
            text_array.append(line)

    dictionary = Dictionary(text_array)
    # print(common_dictionary)
    common_corpus = [dictionary.doc2bow(text) for text in text_array]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus,
                   id2word=dictionary,
                   num_topics=50,
                   passes=10,
                   iterations=1000)
    temp_file = datapath("LDA_twitter")
    lda.save(temp_file)
    topics = lda.get_topics()
    print(topics.shape)

    topic_list = lda.print_topics(50)
    for topic in topic_list:
        print(topic)
Example #14
def run(data_name):
    print('Working on ' + data_name)
    corpus = []

    # preprocess
    with open('../data/' + data_name + '/' + data_name + '.tsv') as dfile:
        dfile.readline()
        for line in dfile:
            line = line.strip().split('\t')
            corpus.append(line[1].split())

    # build dictionary
    dictionary = Dictionary(corpus)
    dictionary.save(data_name + '.dict')

    # documents to indices
    doc_matrix = [dictionary.doc2bow(doc) for doc in corpus]
    del corpus  # release memory
    ldamodel = LdaModel(doc_matrix,
                        id2word=dictionary,
                        num_topics=10,
                        passes=2,
                        alpha='symmetric',
                        eta=None)
    ldamodel.save(data_name + '.model')
Example #15
def create_lda_model(project, corpus, id2word, name, use_level=True, force=False):
    model_fname = project.full_path + name + str(project.num_topics)
    if use_level:
        model_fname += project.level

    model_fname += '.lda.gz'


    if not os.path.exists(model_fname) or force:
        if corpus:
            update_every=None # run in batch if we have a pre-supplied corpus
        else:
            update_every=1

        model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         alpha=project.alpha,
                         eta=project.eta,
                         passes=project.passes,
                         num_topics=project.num_topics,
                         iterations=project.iterations,
                         eval_every=None, # disable perplexity tests for speed
                         update_every=update_every,
                         )

        if corpus:
            model.save(model_fname)
    else:
        model = LdaModel.load(model_fname)

    return model, model_fname
Example #16
def train(data, valid_h1, valid_h2, vocab):

    #logging.basicConfig(filename=args.save_path + 'lda.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = LdaModel(id2word=vocab,
                     num_topics=args.topics,
                     random_state=0,
                     chunksize=args.batch_size,
                     update_every=args.batch_size,
                     alpha='auto',
                     eta=None,
                     decay=args.decay,
                     offset=args.offset,
                     per_word_topics=True)

    best_perplexity = float('inf')

    for epoch in range(args.epochs):

        model.update(data, passes=1, eval_every=1, gamma_threshold=0.001)
        print("Epoch number {}".format(epoch), end=' ')

        val_perplexity = evaluate(data, valid_h1, valid_h2, model, 'valid')
        if val_perplexity < best_perplexity:
            best_perplexity = val_perplexity
            model.save(os.path.join(args.save_path, 'model.ckpt'))
Example #17
def lda_main(word_with_pos = WORD_WITH_POS, topic_num = LDA_TOPIC_NUM):
    LDA_MODEL = './models/lda_{}.model'.format(topic_num)
    stop_word = read_stopword()
    begin_t = time.time() 
    def func(line):
        '''
        whether to bundle the POS tag with each word
        '''
        line = line.strip()
        json_data = json.loads(line)
        content = json_data['content']
        if word_with_pos:
            word_list = [j[0] + j[1] for j in content if j[0] not in stop_word]
        else:
            word_list = [j[0]for j in content if j[0] not in stop_word]
            
        return word_list
    
    with open(DATA_JSONLINE) as f:
        
        # words = [func(i) for i in f.readlines()]
        words = []
        for i in f.readlines():
            words.append(func(i))
        print('Data loading complete! use ', time.time()-begin_t, 'sec.\n begin lda modeling')
        dic = corpora.Dictionary(words)
        corpus = [dic.doc2bow(text) for text in words]
        dic.save(DICTIONARY_PATH)
        corpora.MmCorpus.serialize(CORPUS_PATH, corpus)
        lda = LdaModel(corpus=corpus, id2word=dic, num_topics=topic_num)
        lda.save(LDA_MODEL)
        vis_data = pyLDAvis.gensim.prepare(lda, corpus, dic)
        vis_html_path = 'ldavis_{}.html'.format(topic_num)
        pyLDAvis.save_html(vis_data, vis_html_path)
        print('LDA modeling complete!\nTotal use:', time.time() - begin_t, 'sec.')
Example #18
def chosen_lda(corpus, dictionary, data, n_topics, alpha=.1, eta=0.01):
    '''
    This function trains a Gensim LDA model on chosen hyperparameters
    
    Arguments:
    ----------
    corpus : matrix-format corpus (BOW or TF-IDF)
    dictionary : corpus-related dictionary
    data : text data for coherence score computation
    n_topics : number of desired topics
    alpha : alpha parameter (from 0 to infinity)
    eta : beta parameter (from 0 to infinity)
    
    Outputs:
    ----------
    lda : trained model
    '''
    
    lda = LdaModel(corpus=corpus, 
                id2word=dictionary, 
                num_topics=n_topics,
                random_state=100, 
                alpha=alpha, 
                eta=eta)
    
    ldatopics = [[word for word, prob in topic] for topicid, topic in lda.show_topics(formatted=False)]
    lda_coherence = CoherenceModel(topics=ldatopics, texts=data, dictionary=dictionary, window_size=10).get_coherence()
    print(lda_coherence)
    lda.print_topics(num_topics=n_topics)
    
    lda.save('../03_Dump/model')
    return lda
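An illustrative call for chosen_lda above, assuming a small list of tokenized documents; real inputs would be a full BOW or TF-IDF corpus:

from gensim.corpora import Dictionary

docs = [['topic', 'model', 'inference'], ['another', 'tiny', 'document']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# note: chosen_lda saves to ../03_Dump/model, so that directory must exist
lda_model = chosen_lda(corpus, dictionary, docs, n_topics=35, alpha=0.1, eta=0.01)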
Example #19
    def set_model(self, lang: str, data_version: int, dictionary_version: float, model_version: str, param_name: str, param_version: int,
                  model_file_path: str, language_processed_data: list):
        """
        'alpha'& 'eta' are hyperparameters that affect sparsity of the topics.
        According to the Gensim docs, both defaults to 1.0/num_topics prior.
        :return:
        """
        # Make an index-to-word dictionary.
        logging.info("---- Creating LDA model")
        temp = self.essentials.dictionary[0]  # accessing an entry forces id2token to be populated
        # for the multicore model the optimal setting is workers=3, one less than the number of cores
        model = LdaModel(
            # workers=self.workers,
            corpus=self.essentials.corpus,
            id2word=self.essentials.dictionary.id2token,
            chunksize=self.chunk_size,
            alpha=self.alpha,
            eta=self.beta,
            iterations=self.iterations,
            num_topics=self.number_of_topics,
            passes=self.passes,
            eval_every=self.eval_every
        )
        model.save(model_file_path)
        self.model = model
        logging.info("---- LDA model is created")

        metrics = self.get_model_evaluation_metrics(language_processed_data)
        parameters = self.get_model_parameters()
        self.write_model_evaluation_metrics(lang, data_version, dictionary_version, model_version, param_name, param_version, metrics, parameters)
        return
def save_model(model: LdaModel, path='../artefacts/model', suffix=''):
    """Helper function to save Gensim LdaModel at specified path
    """
    if suffix:
        path = path + '_' + suffix
    model.save(path)
    print(f'model saved at {path}')
Example #21
File: lda.py Project: huaiwen/GraBTax
def train_lda(
    corpus,
    dictionary,
    save=False,
    file=os.path.join(config.map("Storage")['storage_dir'], 'lda.mdl')):
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=50,
                   update_every=1,
                   chunksize=10000,
                   passes=10)
    if save:
        lda.save(file)

    #cm = CoherenceModel(model=lda, corpus=corpus, dictionary=dictionary, coherence='u_mass')
    #print(cm.get_coherence())
    import pyLDAvis.gensim
    topicmodel = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    html = pyLDAvis.display(topicmodel)
    import webbrowser
    with open('viz.html', 'w') as f:

        message = html.data

        f.write(message)

    webbrowser.open_new_tab('viz.html')
Example #22
def LDA_model_from_token(text_file_name):
    token_file_name = text_file_name[:-4] + '.csv'
    print("loading "+token_file_name)
    data_word = []
    with codecs.open(token_file_name, 'r') as f:
        rdr = csv.reader(f)
        next(rdr)
        for i, line in enumerate(rdr):
            data_word.append(line)
        print("Complete loading")


    id2word=corpora.Dictionary(data_word)
    id2word.filter_extremes(no_below = 10)  # drop words that appear in fewer than 10 documents
    texts = data_word
    corpus=[id2word.doc2bow(text) for text in texts]

    lda = LdaModel(corpus, num_topics=10, id2word=id2word)

    temp_file = datapath(token_file_name[:-4])
    lda.save(temp_file)

    lda = LdaModel.load(temp_file)

    topics = lda.print_topics(num_words=10)
    for topic in topics:
        print(topic)
Example #23
def build_lda_model(dictionary, corpus, should_rebuild):
    lda = list()

    # DEBUG
    should_rebuild = True

    # debug_print('datapath:LDA', datapath(cfg.LDA_BACKUP))

    if not should_rebuild:
        try:
            print('Loading LDA Model backup...')
            lda_file = utils.get_file_path(cfg.LDA_BACKUP)
            print('LDA file = {}'.format(lda_file))

            lda = LdaModel.load(lda_file)

        except Exception as exc:
            utils.print_exception_details('Building LDA Model', exc)

    else:
        print('Building LDA Model...')
        lda = LdaModel(corpus,
                       id2word=dictionary,
                       random_state=cfg.RANDOM_STATE,
                       num_topics=cfg.NUM_TOPICS,
                       passes=cfg.NUM_PASSES)
        print('Done!')
        # Save Model Structures
        LDA_FILE = utils.get_file_path(cfg.LDA_BACKUP)
        lda.save(LDA_FILE)

    return lda
Example #24
File: lda.py Project: pengm-hub/HIN
def save_ldamodel(dictionary, text_data, cnt_cata):

    corpus = [dictionary.doc2bow(text) for text in text_data]
    ldamodel = LdaModel(corpus, num_topics=cnt_cata, id2word=dictionary)
    # inspect the topics
    for topic in ldamodel.print_topics():
        print(topic[1])
    ldamodel.save('model/{}/ADA.gensim'.format(cnt_cata))  # save() takes the target path; there is no file-mode argument
Example #25
def lda_gensim(id2word,
               doc2bow,
               n_topics=params.lda_params_default['n_topics']):
    """
    Implements gensim LDA algorithm.

    Parameters
    ----------
    id2word
        Maps token IDs to words
    doc2bow
        Maps documents to bag-of-words lists
    n_topics : int
        Total number of topics

    Returns
    -------
    model
        Trained LDA model

    """

    try:
        model = LdaModel.load('lda_model_{}'.format(n_topics))
        # coh_model_umass = CoherenceModel.load('umass_coherence_model_{}'.format(n_topics))
        # coh_model_cv = CoherenceModel.load('cv_coherence_model_{}'.format(n_topics))

    except FileNotFoundError:
        # Trains LDA model and returns key words for each topic
        model = LdaModel(corpus=doc2bow,
                         id2word=id2word,
                         iterations=500,
                         num_topics=n_topics,
                         random_state=1,
                         alpha='auto',
                         eta='auto',
                         )

        model.save('lda_model_{}'.format(n_topics))

        '''
        print('Training coherence models...')
        coh_model_umass = CoherenceModel(model=model,
                                         corpus=doc2bow,
                                         dictionary=id2word,
                                         coherence='u_mass',
                                         )
        coh_model_umass.save('umass_coherence_model_{}'.format(n_topics))
        
        coh_model_cv = CoherenceModel(model=model,
                                      texts=corpus_text.values,
                                      dictionary=id2word,
                                      coherence='c_v',
                                      )
        # coh_model_cv.save('cv_coherence_model_{}'.format(n_topics))
        '''

    return model
Example #26
class Lda(ModelABC):
    """Represent news articles as vectors using Latent Dirichlet Allocation."""
    def __init__(self,
                 dictionary: Dictionary,
                 corpus=None,
                 size: int = 100,
                 decay=0.5,
                 lda_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of feature vector
        :param decay: The decay parameter
        :param lda_filename: File name of a previously trained model
        """
        super().__init__(size)

        # Check if we have already trained the Lda model
        if lda_filename is not None and os.path.exists(lda_filename):
            self.lda = LdaModel.load(lda_filename)
            logging.info("LDA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LDA")

            self.lda = LdaModel(corpus=corpus,
                                id2word=dictionary,
                                num_topics=size,
                                passes=1,
                                decay=decay,
                                minimum_probability=0.0)

    def update(self, documents):
        """
        Update model using documents.

        :param documents: The new documents used for update
        """
        self.lda.update(documents)

    def save(self, filename: str):
        """
        Save model to a file.

        :param filename: A model file name
        """
        self.lda.save(filename)

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors.
        """
        return self.lda[items]
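A brief sketch of how the Lda wrapper above might be used with a toy corpus; the file name is illustrative, and _get_vector_representation is normally invoked through the base-class interface:

from gensim.corpora import Dictionary

docs = [['economy', 'market', 'stocks'], ['football', 'league', 'match']]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

model = Lda(dictionary, corpus=corpus, size=10)
vectors = model._get_vector_representation(corpus)  # per-document (topic_id, weight) vectors
model.save('news_lda.model')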
Example #27
def get_topics(data, filepath='./data/spam_topics.pkl'):
    if not os.path.exists(filepath):
        import pyLDAvis.gensim
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel, CoherenceModel

        texts = [sample['lemmas'] for sample in data]

        dictionary = Dictionary(texts)
        dictionary.filter_extremes(no_below=20, no_above=0.4)
        corpus = [dictionary.doc2bow(text) for text in texts]

        chunksize = 500
        passes = 5
        iterations = 400
        eval_every = None

        temp = dictionary[0]  # This is only to "load" the dictionary.
        id2word = dictionary.id2token

        best_coherence = 0
        best_model_filepath = ''
        for num_topics in list(range(2, 20)):
            for alpha in ['asymmetric', 'symmetric']:
                for eta in ['symmetric', 'auto']:
                    # use a separate name so the function's `filepath` argument is not clobbered
                    model_filepath = 'out/topics/{}_{}_{}'.format(num_topics, alpha, eta)
                    model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, alpha=alpha, eta=eta, iterations=iterations, num_topics=num_topics, passes=passes, eval_every=eval_every)
                    coherence = float(CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence())
                    model_filepath += '_{:.4f}'.format(coherence)
                    model.save(model_filepath + '_model.pkl')

                    prepared = pyLDAvis.gensim.prepare(model, corpus, dictionary)
                    pyLDAvis.save_html(prepared, model_filepath + '_plot.html')

                    if coherence > best_coherence:
                        best_coherence = coherence
                        best_model_filepath = model_filepath + '_model.pkl'

        model = LdaModel.load(best_model_filepath)
        print('Best model: {}'.format(best_model_filepath))

        topics = [x[0] for x in model.top_topics(corpus=corpus, texts=texts, dictionary=dictionary, topn=100)]

        data_topics = []
        for i, text in enumerate(texts):
            data_topics.append({k: v for k, v in model.get_document_topics(dictionary.doc2bow(text), minimum_probability=0.0)})

        pickle.dump([topics, data_topics], open(filepath, 'wb'))
    else:
        [topics, data_topics] = pickle.load(open(filepath, 'rb'))

    for i in range(len(data_topics)):
        data[i]['topics'] = data_topics[i]

    return topics, data
def save_model(model_path):
    train_set = get_train_set()
    # build the training corpus
    dictionary = Dictionary(train_set)
    corpus = [dictionary.doc2bow(text) for text in train_set]

    # train the LDA model
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    lda.print_topics(100)

    lda.save(model_path)
def build_topic(df, load_existing=True):

    words = ut.get_text_array(df)
    dictionary = corpora.Dictionary(words)
    if load_existing and os.path.exists('lda_model.h5'):
        model = LdaModel.load('lda_model.h5')
        return model, dictionary

    corpus = [dictionary.doc2bow(text) for text in words]
    model = LdaModel(corpus, num_topics=NUM_TOPICS)
    model.save('lda_model.h5')
    return model, dictionary
Example #30
    def train_model(self, num_topics):
        corpus = self.get_corpus()
        model = LdaModel(corpus,
                         chunksize=2000,
                         passes=20,
                         iterations=200,
                         num_topics=num_topics,
                         eval_every=None)
        tmp_fname = self.path + "lda.model"
        model.save(tmp_fname)

        return model
def create_model(config, Kind):
    model_fname = config.model_fname % Kind.__name__
    corpus_fname = config.corpus_fname % Kind.__name__

    if not os.path.exists(model_fname):
        try:
            id2word = Dictionary.load(corpus_fname + '.dict')
            corpus = MalletCorpus(corpus_fname, id2word=id2word)
            logger.info('Opened previously created corpus: %s' % corpus_fname)
        except:
            error('Corpora for building file models not found!')

        file_model = LdaModel(corpus,
                              id2word=corpus.id2word,
                              alpha=config.alpha,
                              passes=config.passes,
                              num_topics=config.num_topics)

        file_model.save(model_fname)
Example #32
File: news.py Project: xialei/poc
def lda():
    # remove stop words
    stopwords = codecs.open('../conf/stop_words_ch.txt', mode='r', encoding='utf8').readlines()
    stopwords = [ w.strip() for w in stopwords ]
    
    fp = codecs.open(r'D:\nlp\corpora\segs.txt', mode='r', encoding='utf8')
    train = []
    for line in fp:
        line = line.split()
        train.append([ w for w in line if w not in stopwords ])
    
    dictionary = corpora.Dictionary(train)
    corpus = [ dictionary.doc2bow(text) for text in train ]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=100)
    
    lda.print_topics(30)
    # print topic id=20
    lda.print_topic(20)
    
    # save/load model
    lda.save(r'D:\nlp\corpora\news.model')  # raw string: otherwise \n is read as a newline
        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform PLSA
    logging.info('perform PLSA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, chunksize=chunksize, alpha=1., eta=1.)
        lda.save('result/model_wiki.plsa')
        lda.print_topics(num_topics=num_topics, num_words=10)
    else:
        lda = LdaModel(corpus=report_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, chunksize=chunksize, alpha=1., eta=1.)
        lda.save('result/model.plsa')
        lda.print_topics(num_topics=num_topics, num_words=10)
Example #34
File: dmp.py Project: npiaq/dmp
class DMP(object):

    def __init__(self):
        self.dic = None
        self.lda = None
        self.topic_num = config.getint('dmp', 'topic_num')
        self.corpus_file = config.get('dmp', 'corpus_file')

    @staticmethod
    def __text2doc(iterator, sep=u' '):
        '''Convert text lines into documents.
        Each line is split into a list of words via split().

        Parameters
            sep: the separator

        Returns
            the list of tokenized documents
        '''
        docs = []
        for line in iterator:
            text = line.strip().split(sep)
            docs.append(text)
        return docs

    def __load_corpus(self):
        '''Load the corpus, converting the text into word lists via __text2doc.

        Returns
            the list of processed documents.
        '''
        docs = None
        with codecs.open(self.corpus_file, 'r', 'utf-8') as iterator:
            docs = self.__text2doc(iterator)
        return docs

    def train(self):
        '''Train the model, producing two objects: the dictionary (dic) and the model (lda).

        dic: stores the words, each with an integer id; a word can be looked up via dic[id]
        lda: the model, holding the list of topics. Each topic has an id, and the words of
             a topic can be fetched via lda.print_topic(id)
        '''
        docs = self.__load_corpus()
        self.dic = Dictionary(docs)
        bow = [self.dic.doc2bow(doc) for doc in docs]
        self.lda = LdaModel(bow, id2word=self.dic,
                            num_topics=self.topic_num)

    def infer(self, doc):
        '''Infer the topics of a new document.

        Parameters
            doc: the new document, given as a list of words

        Returns
            an iterator over the topic list; topics are given by id, so call
            lda.print_topic to make them human-readable.
        '''
        bow = self.dic.doc2bow(doc)
        topics = self.lda[bow]
        return topics

    def dump(self):
        '''Dump the lda model and the dic dictionary.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda.save(lda_file)
        self.dic.save(dic_file)

    def load(self):
        '''Load the lda model and the dic dictionary.
        '''
        lda_file = config.get('dmp', 'lda_file')
        dic_file = config.get('dmp', 'dic_file')
        self.lda = LdaModel.load(lda_file)
        self.dic = Dictionary.load(dic_file)
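A sketch of the DMP workflow documented above; the corpus_file, lda_file, and dic_file locations come from whatever the dmp config defines:

dmp = DMP()
dmp.train()                              # builds dic and lda from corpus_file
topics = dmp.infer(['word1', 'word2'])   # word list for a new document (placeholder tokens)
for topic_id, prob in topics:
    print(topic_id, prob, dmp.lda.print_topic(topic_id))
dmp.dump()                               # persists the model and dictionary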
        #
        # logging.info('combine report and wiki dictionary...')
        # wiki_to_report = report_dict.merge_with(wiki_dict)
        # merged_dict = report_dict
        #
        # logging.info('combine report and wiki corpus...')
        # merged_corpus = wiki_to_report[wiki_corpus].corpus + report_corpus
        logging.info('generate wiki corpus...')
        wiki_txt = unpickle('data/txt/processed_wiki.pkl')
        wiki_corpus = [report_dict.doc2bow(wiki) for wiki in wiki_txt]

        logging.info('combine report and wiki corpus...')
        merged_corpus = wiki_corpus + report_corpus

    # compute TFIDF
    # logging.info('compute TFIDF...')
    # tfidf = TfidfModel(dictionary=report_dict, id2word=report_dict)

    # perform LDA
    logging.info('perform LDA...')
    if use_wiki is True:
        lda = LdaModel(corpus=merged_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model_wiki.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)
    else:
        lda = LdaModel(corpus=report_corpus, id2word=report_dict, num_topics=num_topics, passes=passes,
                       iterations=iterations, alpha='auto', chunksize=chunksize)
        lda.save('result/model.lda')
        lda.print_topics(num_topics=num_topics, num_words=10)
Example #36
def create_lda_model():
    logging.info('about to create all docs from chunks')
    start_time = datetime.datetime.now()
    create_all_docs()
    end_time = datetime.datetime.now()
    logging.info('total time is: %s', end_time - start_time)

    logging.info('about to load all docs')
    with open('./resources/LDA_processing/all_docs.pkl', mode='rb') as f:
        all_docs = pickle.load(f)

    logging.info('about to load english words')
    with open('./resources/LDA_input/english_full_list.txt') as f:
        english_words = f.read().splitlines()

    good_english_words = set(english_words[75:21000])
    del english_words
    logging.info('about to remove all stop-words and unknown words')
    texts = []
    for i, doc in enumerate(all_docs):
        filtered_doc = [word for word in doc if word in good_english_words]
        texts.append(filtered_doc)
        if i % 5000 == 0:
            logging.info('Finished doc: %s', i)

    logging.info('about to release memory of all_docs and english_words')
    del all_docs
    del good_english_words

    logging.info('about to save texts')
    with open('./resources/LDA_processing/texts.pkl', mode='wb') as f:
        pickle.dump(texts, f)

    logging.info('about to load texts')
    with open('./resources/LDA_processing/texts.pkl', mode='rb') as f:
        texts = pickle.load(f)

    logging.info('about to create dictionary')
    dictionary = corpora.Dictionary(texts)
    keys = dictionary.keys()
    logging.info('dict size before filter: %s', len(keys))
    dictionary.filter_extremes(keep_n=150000)
    dictionary.filter_extremes(no_below=150, no_above=0.05)
    keys = dictionary.keys()
    logging.info('dict size after filter: %s', len(keys))
    dictionary.save('./resources/LDA_processing/lda.dict')
    dictionary.save_as_text('./resources/LDA_processing/lda_dict.txt')

    logging.info('about to create corpus')
    corpus = [dictionary.doc2bow(text) for text in texts]

    logging.info('about to save corpus as mm file')
    corpora.MmCorpus.serialize('./resources/LDA_processing/corpus.mm', corpus)

    logging.info('about to load dictionary file')
    dictionary = corpora.Dictionary.load('./resources/LDA_processing/lda.dict')

    logging.info('about to load corpus as mm file')
    corpus = corpora.MmCorpus('./resources/LDA_processing/corpus.mm')

    logging.info('about to start LDA model')
    lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    logging.info('finished LDA model')

    logging.info('about to save ldaModel')
    lda.save('./resources/LDA_processing/LdaModel')

    logging.info('about to load ldaModel')
    lda = LdaModel.load('./resources/LDA_processing/LdaModel')

    logging.info('about to find topics')
    topics = lda.show_topics(num_topics=num_topics, num_words=10000, log=True, formatted=False)

    logging.info('about to save topics')
    with open('./resources/LDA_processing/topics.pkl', mode='wb') as f:
        pickle.dump(topics, f)

    dict_word_sets = find_words_from_lda_model()
    with open('./resources/LDA_processing/dict_word_sets.pkl', mode='wb') as f:
        pickle.dump(dict_word_sets, f)

    topics_words = extract_words_from_word_sets()
    with open('./resources/LDA_result/topic_words', mode='wt', encoding='utf-8') as f:
        f.write('\n'.join(topics_words))
Example #37
print("fitting the model ...\n")

model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

#model = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
#                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

print(model, "\n")

topics = model.show_topics(num_topics=no_of_topics)

for item, i in zip(topics, enumerate(topics)):
    print("topic #"+str(i[0])+": "+str(item)+"\n")


print("saving ...\n")

if not os.path.exists("out"): os.makedirs("out")

with open("out/"+foldername+"_doclabels.txt", "w") as f:
    for item in doc_labels: f.write(item+"\n")

with open("out/"+foldername+"_topics.txt", "w") as f:
    for item, i in zip(topics, enumerate(topics)):
        f.write("topic #"+str(i[0])+": "+str(item)+"\n")

dictionary.save("out/"+foldername+".dict")
MmCorpus.serialize("out/"+foldername+".mm", corpus)
model.save("out/"+foldername+".lda")
def main():
    '''
    Runs cuisine similarity and clustering analysis according to default args at top of file
    '''
    
    # Select cuisines based on random sample
    print("Selecting cuisines...")
    start = clock()
    cuisine_files = [file for file in os.listdir(CUISINE_DIRECTORY) if os.path.isfile(os.path.join(CUISINE_DIRECTORY, file))]
    if os.path.isfile(CUISINE_IGNORE):
        cuisine_types = []
        with open(CUISINE_IGNORE, "rt") as f:
            for line in f:
                cuisine_types.append(line.rstrip())
        cuisine_files = list(set(cuisine_files) - set(cuisine_types))
    random.seed(RSEED)
    random.shuffle(cuisine_files)
    cuisine_files = cuisine_files[:MAX_CUISINES]
    num_cuisines = len(cuisine_files)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # process reviews for each category, retaining only those with ratings in stars
    # and calculate total document word lengths for each cluster (prior to processing)
    print("Processing cuisine reviews...")
    start = clock()
    doc_lens = []
    for filename in cuisine_files:
        doclen = 0
        with open(os.path.join(CUISINE_DIRECTORY, filename), "rt") as f:
            with open(os.path.join(CUISINE_DIRECTORY, "processed", filename), "wt") as g:            
                for line in f:
                    if int(line[RATING_INDEX]) in STARS:
                        info = line[:REVIEW_INDEX]
                        line = line[REVIEW_INDEX:]
                        doclen += len(line.split())
                        line = process_document(line)
                        g.write(info + line + "\n")
        print("Processing of %s is complete!" % (filename,))
        doc_lens.append(doclen)
    with open("cuisine_selected.txt", "wt") as f:
        for i in range(len(cuisine_files)):
            f.write(",".join([cuisine_files[i], str(doc_lens[i])]) + "\n")
    finish = clock()
    print("Processing of reviews is complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # Build corpus from selected cuisine documents
    print("Building cuisine document corpus...")
    start = clock()
    cuisine_corpus = MyCorpus(cuisine_files, os.path.join(CUISINE_DIRECTORY, "processed"))
    with open("cuisine_corpus.pyobject", "wb") as f:
        pickle.dump(cuisine_corpus, f)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()

    # create Tfidf-Model
    print("Running TFIDF model on cuisine corpus...")
    start = clock()
    cuisine_corpus.agglomerate = False
    cuisine_tfidf = TfidfModel(corpus = cuisine_corpus, id2word = cuisine_corpus.dictionary, \
                               wlocal = tf_func, wglobal = idf_func, normalize = False)
    with open("cuisine_tfidf.pyobject", "wb") as f:
        pickle.dump(cuisine_tfidf, f)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # create list of doc sparse vectors and perform document length normalization
    print("Calculating similarity matrices and writing to files...")
    start = clock()
    cuisine_corpus.agglomerate = True
    cuisine_types = list(map(lambda x: x.split(".")[0], cuisine_corpus.file_list))
    vec_len = len(cuisine_corpus.dictionary.keys())
    avgl = sum(doc_lens) / len(doc_lens)
    doc_sparse_list = []
    for index, doc in enumerate(cuisine_corpus):
        vec = SparseVector(cuisine_tfidf[doc], vec_len)
        vec = vec / (1 - DOCLEN_NORM_B + DOCLEN_NORM_B * doc_lens[index] / avgl)
        doc_sparse_list.append((cuisine_types[index], vec))      
    
    # calculate similarity for all clusters and write to file
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_js = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(len(doc_sparse_list)):
        for ind2 in range(ind1, len(doc_sparse_list)):
            csimvalue = doc_sparse_list[ind1][1].cosine_similarity(doc_sparse_list[ind2][1])
            tsimvalue = doc_sparse_list[ind1][1].tanimoto_similarity(doc_sparse_list[ind2][1])
            jsimvalue = doc_sparse_list[ind1][1].jaccard_similarity(doc_sparse_list[ind2][1])
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            dat_js[ind1][ind2] = jsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
                dat_js[ind2][ind1] = jsimvalue
    cs_df = pd.DataFrame(dat_cs, index = cuisine_types, columns = cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index = cuisine_types, columns = cuisine_types)
    js_df = pd.DataFrame(dat_js, index = cuisine_types, columns = cuisine_types)
    cs_df.to_csv("cosine_similarity_df.csv", header = True, index = True)
    ts_df.to_csv("tanimoto_similarity_df.csv", header = True, index = True)
    js_df.to_csv("jaccard_similarity_df.csv", header = True, index = True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # clear potential large objects from memory prior to running any further analyses
    del doc_sparse_list
    del dat_cs
    del dat_ts
    del dat_js
    del cs_df
    del ts_df
    del js_df
   
    # run lda analysis
    print("Running LDA with %d topics..." % (LDA_TOPICS,))
    start = clock()
    cuisine_corpus.agglomerate = False
    lda = LdaModel(corpus = cuisine_tfidf[cuisine_corpus], id2word = cuisine_corpus.dictionary, \
                   num_topics = LDA_TOPICS, eval_every = None, chunksize = LDA_CHUNKSIZE, iterations = 200, \
                   passes = 2)
    lda.save("lda_cuisines.pyobject")
    
    # create dense numpy matrix
    cuisine_corpus.agglomerate = True
    rows, cols = len(cuisine_files), LDA_TOPICS
    lda_array = np.zeros(rows * cols).reshape(rows, cols)
    for row, doc in enumerate(cuisine_corpus):
        entries = lda[doc]
        for col, value in entries:
            lda_array[row][col] = value
    with open("lda_array.npy", "wb") as f:
        np.save(f, lda_array)
    finish = clock()
    print("LDA complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()
   
    # calculate similarity for all lda documents and write to file
    print("Calculating LDA similarity matrices...")
    start = clock()
    dat_cs = np.zeros((len(cuisine_types), len(cuisine_types)))
    dat_ts = np.zeros((len(cuisine_types), len(cuisine_types)))
    for ind1 in range(lda_array.shape[0]):
        vec1 = lda_array[ind1,:]
        for ind2 in range(ind1, lda_array.shape[0]):
            vec2 = lda_array[ind2,:]
            csimvalue = vec1.dot(vec2) / np.sqrt(vec1.dot(vec1) * vec2.dot(vec2))
            tsimvalue = vec1.dot(vec2) / (vec1.dot(vec1) + vec2.dot(vec2) - vec1.dot(vec2))
            dat_cs[ind1][ind2] = csimvalue
            dat_ts[ind1][ind2] = tsimvalue
            if ind1 != ind2:
                dat_cs[ind2][ind1] = csimvalue
                dat_ts[ind2][ind1] = tsimvalue
    cs_df = pd.DataFrame(dat_cs, index = cuisine_types, columns = cuisine_types)
    ts_df = pd.DataFrame(dat_ts, index = cuisine_types, columns = cuisine_types)
    cs_df.to_csv("lda_cosine_similarity_df.csv", header = True, index = True)
    ts_df.to_csv("lda_tanimoto_similarity_df.csv", header = True, index = True)
    finish = clock()
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # clear up memory        
    del dat_cs
    del dat_ts
    del cs_df
    del ts_df
    
    # perform k-means clustering analysis on 50 clusters using 5-fold with penalty coefficient
    start = clock()
    cv_folds = np.tile(np.arange(10), np.ceil(len(cuisine_files) / 10))
    np.random.seed(RSEED)
    np.random.shuffle(cv_folds)
    kmeans_results = []
    for n_clusters in range(1, int(len(cuisine_files) - np.ceil(len(cuisine_files) / 10) + 1)):
        print("Analyzing for %d cluster(s)..." % (n_clusters,))
        penalty = len(cuisine_files) / (len(cuisine_files) - n_clusters + 1)
        total_ssr = 0
        for i in range(10):
            train_index = np.where(cv_folds != i)[0]
            test_index = np.where(cv_folds == i)[0]
            kmeans_model = KMeans(n_clusters)
            kmeans_model.fit(lda_array[train_index,:])
            total_ssr += np.sum(np.min(kmeans_model.transform(lda_array[test_index,:]), axis = 1)**2)
        kmeans_results.append((n_clusters, total_ssr * penalty))
    with open("kmeans_results.txt", "wt") as f:
        for tup in kmeans_results:
            f.write(str(tup[0]) + "," + str(tup[1]) + "\n")
    finish = clock()
    print("Cross-Validation analysis complete!")
    print("Running time: %.2f seconds" % (finish - start,))
    print()
    
    # fit final model
    sel_clusters = min(kmeans_results, key = lambda x: x[1])[0]
    print("Fitting final optimal kmeans model...")
    print("Minimum occurs at %d clusters." % (sel_clusters,))
    kmeans_final = KMeans(sel_clusters)
    final_clusters = np.argmin(kmeans_final.fit_transform(lda_array), axis = 1)
    clusters = {key:[] for key in range(sel_clusters)}
    for index, cuisine in enumerate(cuisine_types):
        clusters[final_clusters[index]].append(cuisine)
    with open("optimal_clusters.txt", "wt") as f:
        for i in range(sel_clusters):
            f.write(",".join(clusters[i]) + "\n")
    print()
            
    # fit model with 20 clusters
    sel_clusters = 20 if len(cuisine_files) > 20 else len(cuisine_files)
    print("Fitting kmeans model with %d clusters..." % (sel_clusters,))
    kmeans_final = KMeans(sel_clusters, n_init = 100)
    final_clusters = np.argmin(kmeans_final.fit_transform(lda_array), axis = 1)
    clusters = {key:[] for key in range(sel_clusters)}
    for index, cuisine in enumerate(cuisine_types):
        clusters[final_clusters[index]].append(cuisine)
    with open("many_clusters.txt", "wt") as f:
        for i in range(sel_clusters):
            f.write(",".join(clusters[i]) + "\n")
    print()
Example #39
File: lda2.py Project: pielstroem/Topics
log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)

for item, i in zip(topics, enumerate(topics)):
    log.info('topic #%s: %s', i[0], item)


log.info('saving results...')

# create output folder
if not os.path.exists("out"): os.makedirs("out")

# save doc_labels for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"),''.join([foldername, "_doclabels.txt"])), "w", encoding="utf-8") as f:
    for item in doc_labels: f.write(item+"\n")
	
# save topics for further use
with open(os.path.join(os.path.join(os.getcwd(), "out"), ''.join([foldername, "_topics.txt"])), "w", encoding="utf-8") as f:
    for item, i in zip(topics, enumerate(topics)):
        f.write("".join(["topic #",str(i[0]),": ",str(item),"\n"]))

# save dictionary for further use
dictionary.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'dict'])))

# save model for further use
model.save(os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'lda'])))

log.info('topic modeling finished')
Example #40
class W2V_cpp2(W2V_base):
    def __init__(self,n_topic, path, folder):
        self.n_topic = n_topic
        W2V_base.__init__(self, path, folder)

        #process dict
        for prod_id in list(self.idx2prod.keys()):  # copy the keys; the dict is modified inside the loop
            prod = self.idx2prod[prod_id]
            n_prod_id = prod_id - len(self.word_count) - 1
            del self.idx2prod[prod_id]
            self.idx2prod[n_prod_id] = prod
            self.prod2idx[prod] = n_prod_id

        for user_id in list(self.idx2user.keys()):  # copy the keys; the dict is modified inside the loop
            user = self.idx2user[user_id]
            n_user_id = user_id - len(self.word_count) - len(self.prod2idx) - 1
            del self.idx2user[user_id]
            self.idx2user[n_user_id] = user
            self.user2idx[user] = n_user_id

    def train(self):
        data = []
        entity2id = {}
        id2entity = []

        for obj in self.data:
            doc = []
            obj_sents = obj["text_data"]
            entity = obj["prod"]
            if entity not in entity2id:
                entity2id[entity] = len(entity2id)
                id2entity.append(entity)
            doc_id = entity2id[entity]

            for obj_sent in obj_sents:
                for pair in obj_sent:
                    if pair[0] >= 0:
                        doc.append((pair[0], doc_id))
            data.append(doc)



        self.ldamodel = LdaModel(corpus=data, id2word=self.idx2word, num_topics=self.n_topic)

        f_entity = open("lda/prod.txt", "w")
        f_model = open("lda/model.txt", "w")
        f_model.write(str(len(entity2id)))
        f_model.write(" ")
        f_model.write(str(self.n_topic))
        f_model.write("\n")

        for entity in id2entity:
            f_entity.write(entity)
            f_entity.write("\n")

            f_model.write(entity)
            f_model.write(" ")

            distr = self.ldamodel.get_document_topics(data[1], minimum_phi_value=0, minimum_probability=0)
            distr = [pair[1] for pair in distr]

            for prod in distr:
                f_model.write(str(prod))
                f_model.write(" ")

            f_model.write("\n")

        self.ldamodel.save("lda/model_200")
Example #41
File: docs.py Project: rafunchik/shrimps
# try with BoW vectors too?



#  we will use Latent Dirichlet Allocation to try to categorize the abstracts
# this is slow the first time it runs, while the model is trained
print("lda")
lda_filename = '/tmp/model.lda'  # use one path consistently for the existence check, save, and load
if not os.path.isfile(lda_filename):
    lda = LdaModel(corpus, num_topics=5,
                   id2word=dictionary,
                   update_every=5,
                   chunksize=10000,
                   passes=100)
    lda.save(lda_filename)
else:
    lda = LdaModel.load(lda_filename)
lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=7)

print(topics_matrix)
print(len(topics_matrix))

for topic in topics_matrix:
    i = topic[1]
    print([str(word) for word in i])
#
# topics_matrix = np.array(topics_matrix)
#
# topic_words = topics_matrix[:, :, 1]
Example #42
File: demo.py Project: pielstroem/Topics
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile(r'\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s")) # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))
            # construct documents
            if doc_split:  # size according to paragraph id
                doc = doc.groupby('ParagraphId')
                for para_id, para in doc:
                    docs.append(para['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(para_id)]))
            else:  # size according to doc_size
                doc = doc.sort_values(by='TokenId')
                i = 1
                while(doc_size < doc.shape[0]):
                    docs.append(
                        doc[:doc_size]['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(i)]))
                    doc = doc.drop(doc.index[:doc_size])
                    i += 1
                docs.append(doc['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))

            if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
                os.makedirs(os.path.join(os.getcwd(), "swcorp"))

            swpath = os.path.join('swcorp', "".join(file_label))

            with open(swpath + ".txt", 'w', encoding="utf-8") as text:
                text.write(" ".join(
                    word for word in doc['Lemma'].values.astype(str)
                    if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #   [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for text in folder:
            # use a distinct handle name so the comparison with folder[-1] below
            # still sees the file name rather than a file object
            with open(text, 'r', encoding="utf-8") as infile:
                textline = [re.sub(
                    r'\\n\\r', '', document) for document in ' '.join(
                        infile.read().split())]
                if text != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(
            mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)

    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for item, i in zip(topics, enumerate(topics)):
        print("topic #"+str(i[0])+": "+str(item)+"\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
            for item in doc_labels:
                f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write(
                "".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example #43
    vocab = Dictionary.load_from_text('./vocab.txt')
    corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
    valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
    valid_sentences = [doc for doc in valid_corpus][5000:]

    # varing number of topics
    # result = {}
    # for num_topics in [2, 4, 8, 16, 32, 64]:
    #     best_value = -100
    #     for i in range(5):
    #         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
    #         likelihood = model.log_perplexity(valid_sentences)
    #         best_value = max(best_value, likelihood)
    #     result[num_topics]= best_value
    #
    # for num_topics, likelihood in result.iteritems():
    #     print 'num_topics: %d, best word_likelihood: %f' % (num_topics, likelihood)

    model = LdaModel(corpus=corpus, id2word=vocab, num_topics=8, passes=2)
    model.save('./lda_model.txt')
    # print topics to a file
    topics = model.show_topics(num_topics=100, num_words=50)
    with codecs.open('./topics.txt', 'w', 'utf-8') as out_f:
        for topic in topics:
            topic_id, topic_str = topic[0], topic[1]
            out_f.write('%d:\n%s\n' % (topic_id, topic_str))
        out_f.write('\n')



Example #44
print('Saving dictionary (%s)...' % DICT)
dictionary.save(DICT)

print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 // 5  # integer index for the 80/20 train/test split
training = bow_corpus[:size]
testing = bow_corpus[size:]

print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations=1000)

print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(str(topic) for topic in lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)




Example #45
    def run(lda_model_path, corpus_path, num_topics, id2word):
        corpus = corpora.BleiCorpus(corpus_path)
        lda = LdaModel(corpus, num_topics=num_topics, id2word=id2word)
        lda.save(lda_model_path)

        return lda