Example #1
def train(data, valid_h1, valid_h2, vocab):

    #logging.basicConfig(filename=args.save_path + 'lda.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = LdaModel(id2word=vocab,
                     num_topics=args.topics,
                     random_state=0,
                     chunksize=args.batch_size,
                     update_every=args.batch_size,
                     alpha='auto',
                     eta=None,
                     decay=args.decay,
                     offset=args.offset,
                     per_word_topics=True)

    best_perplexity = float('inf')

    for epoch in range(args.epochs):

        model.update(data, passes=1, eval_every=1, gamma_threshold=0.001)
        print("Epoch number {}".format(epoch), end=' ')

        val_perplexity = evaluate(data, valid_h1, valid_h2, model, 'valid')
        if val_perplexity < best_perplexity:
            best_perplexity = val_perplexity
            model.save(os.path.join(args.save_path, 'model.ckpt'))
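The evaluate() helper and the args namespace referenced above are not shown in this example. A minimal sketch of what evaluate() might look like, assuming the validation halves are bag-of-words corpora and using gensim's log_perplexity, which returns a per-word bound with perplexity = 2 ** (-bound); the two halves suggest document-completion evaluation, but this sketch only scores the first half:

import numpy as np

def evaluate(data, valid_h1, valid_h2, model, split):
    # Hypothetical helper matching the call above: score one validation half
    # with the variational bound and convert the per-word bound to perplexity.
    bound = model.log_perplexity(valid_h1)
    perplexity = np.exp2(-bound)
    print("{} perplexity: {:.2f}".format(split, perplexity))
    return perplexity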
Example #2
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):

        # Popen has no context manager in Python 2.7, hence the try/finally.
        try:
            # spawn visdom.server
            proc = subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)])

            # wait for visdom server startup (any better way?)
            time.sleep(3)

            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()

            # clear screen
            viz.close()

            self.model.update(self.corpus)
        finally:
            proc.kill()
Example #3
class Lda(ModelABC):
    """Represent news articles as vectors using Latent Dirichlet Allocation."""
    def __init__(self,
                 dictionary: Dictionary,
                 corpus=None,
                 size: int = 100,
                 decay=0.5,
                 lda_filename: str = None):
        """
        :param dictionary: A dictionary
        :param corpus: A corpus for training
        :param size: The length of feature vector
        :param decay: The decay parameter
        :param lda_filename: File name of a previously trained model
        """
        super().__init__(size)

        # Check if we have already trained the Lda model
        if lda_filename is not None and os.path.exists(lda_filename):
            self.lda = LdaModel.load(lda_filename)
            logging.info("LDA model loaded")
        else:
            if corpus is None:
                raise ValueError("Corpus must be provided to train LDA")

            self.lda = LdaModel(corpus=corpus,
                                id2word=dictionary,
                                num_topics=size,
                                passes=1,
                                decay=decay,
                                minimum_probability=0.0)

    def update(self, documents):
        """
        Update model using documents.

        :param documents: The new documents used for update
        """
        self.lda.update(documents)

    def save(self, filename: str):
        """
        Save model to a file.

        :param filename: A model file name
        """
        self.lda.save(filename)

    def _get_vector_representation(self, items):
        """
        Represent documents as vectors.

        :param items: A list of documents
        :return: A list of feature vectors.
        """
        return self.lda[items]
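A possible usage of this wrapper, assuming ModelABC is importable from the surrounding project and using a hypothetical file name; on the first run the model is trained from the corpus, on later runs it is loaded from disk:

from gensim.corpora import Dictionary
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

# First run: no saved file yet, so the model is trained from `corpus`.
model = Lda(dictionary, corpus=corpus, size=20, lda_filename="news_lda.model")
model.save("news_lda.model")

# Later runs: the file exists, so training is skipped and the model is loaded.
model = Lda(dictionary, size=20, lda_filename="news_lda.model")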
Example #4
class LDAModel(Model, Transformer):
    def __init__(self, corpus=None, **kwargs):
        self._m = LdaModel(corpus, **kwargs)

    def fit(self, corpus):
        self._m.update(corpus)

    def transform(self, corpus):
        return self._m[corpus]

    @property
    def inst(self):
        return self._m
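A brief usage sketch of this fit/transform wrapper (the Model and Transformer base classes are assumed to come from the surrounding project); passing corpus=None defers training to fit():

from gensim.corpora import Dictionary
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

model = LDAModel(id2word=dictionary, num_topics=5)  # corpus=None: no training yet
model.fit(corpus)                                   # update() performs the first pass
doc_topics = list(model.transform(corpus))          # per-document topic distributions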
Example #5
class LDA(GenericModel):
    """
    Wrapper for Gensim LdaModel and LdaMulticore
    """
    def __init__(self, *args, **kwargs):
        """
        All provided arguments will be passed to LdaModel or
        LdaMulticore constructors (the latter in case 'workers'
        is present in keyword arguments)

        :param args: positional arguments to initialize model with
        :param kwargs: keyword arguments to pass to model constructor
        """
        if 'workers' in kwargs.keys():
            self.__model__ = LdaMulticore(*args, **kwargs)
        else:
            self.__model__ = LdaModel(*args, **kwargs)

    def fit(self, data: Any, *args, **kwargs):
        # Actually, I think there is no need for this as
        # we can simply use update() for uninitialized model
        self.__model__.update(corpus=data, *args, **kwargs)

    def update(self, data: Any, *args, **kwargs):
        self.__model__.update(corpus=data, *args, **kwargs)

    def get_topics(self,
                   docs: Optional[Iterable[Any]] = None,
                   *args,
                   **kwargs):
        if docs is None:
            topics = self.__model__.show_topics(formatted=False,
                                                *args,
                                                **kwargs)
        else:
            topics = map(
                partial(self.__model__.get_document_topics,
                        per_word_topics=True), docs)
        topics, t_copy, t_copy_1 = tee(topics, 3)

        ids = map(lambda x: x[0], topics)
        words = map(lambda x: x[1], t_copy)
        words = map(lambda x: list(zip(*x))[0], words)
        scores = map(lambda x: x[1], t_copy_1)
        scores = map(lambda x: list(zip(*x))[1], scores)

        topics = zip(ids, zip(words, scores))

        return topics
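A short, assumed usage of the dispatching wrapper above: the presence of the 'workers' keyword selects LdaMulticore, otherwise the single-process LdaModel is used:

from gensim.corpora import Dictionary
from gensim.test.utils import common_texts

dictionary = Dictionary(common_texts)
corpus = [dictionary.doc2bow(text) for text in common_texts]

single = LDA(id2word=dictionary, num_topics=10)            # -> LdaModel
multi = LDA(id2word=dictionary, num_topics=10, workers=2)  # -> LdaMulticore
single.fit(corpus)
topics = list(single.get_topics())  # (topic_id, (words, scores)) pairs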
Example #6
class TestLdaCallback(unittest.TestCase):

    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus, coherence="u_mass", logger="visdom", title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary, num_topics=2, passes=10, callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):
        with subprocess.Popen(['python', '-m', 'visdom.server', '-port', str(self.port)]) as proc:
            # wait for visdom server startup (any better way?)
            viz = Visdom(server=self.host, port=self.port)
            for attempt in range(5):
                time.sleep(1.0)  # seconds
                if viz.check_connection():
                    break
            assert viz.check_connection()
            viz.close()
            self.model.update(self.corpus)
            proc.kill()
Example #7
class TestLdaCallback(unittest.TestCase):
    def setUp(self):
        self.corpus = MmCorpus(datapath('testcorpus.mm'))
        self.ch_umass = CoherenceMetric(corpus=self.corpus,
                                        coherence="u_mass",
                                        logger="visdom",
                                        title="Coherence")
        self.callback = [self.ch_umass]
        self.model = LdaModel(id2word=common_dictionary,
                              num_topics=2,
                              passes=10,
                              callbacks=self.callback)

        self.host = "http://localhost"
        self.port = 8097

    def testCallbackUpdateGraph(self):

        # Popen has no context manager in Python 2.7, hence the try/finally.
        try:
            # spawn visdom.server
            proc = subprocess.Popen(
                ['python', '-m', 'visdom.server', '-port',
                 str(self.port)])

            # wait for visdom server startup (any better way?)
            time.sleep(3)

            viz = Visdom(server=self.host, port=self.port)
            assert viz.check_connection()

            # clear screen
            viz.close()

            self.model.update(self.corpus)
        finally:
            proc.kill()
Example #8
"""
@author: Bokkin Wang
"""

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=10)

# Save model to disk.
temp_file = datapath("model")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

# Create a new corpus, made of previously unseen documents.
other_texts = [['computer', 'time', 'graph'], ['survey', 'response', 'eps'],
               ['human', 'system', 'computer']]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
unseen_doc = other_corpus[0]
vector = lda[unseen_doc]  # get topic probability distribution for a document

lda.update(other_corpus)
vector = lda[unseen_doc]
Example #9
class LDATagger:
    _lda_model = None
    _dictionary = None
    _lda_model_path = None
    _dictionary_path = None
    DEFAULT_MODEL_PATH = os.path.join(os.path.dirname(__file__), "model")
    DEFAULT_NUM_TOPICS = 1000

    def __init__(self,
                 model_path=DEFAULT_MODEL_PATH,
                 num_topics=DEFAULT_NUM_TOPICS,
                 lock=threading.Lock()):
        self.save_model_lock = lock

        if os.path.isfile(model_path):
            raise Exception("Invalid Model Path; Should Be a Directory")
        if not os.path.exists(model_path):
            os.makedirs(model_path)
        self._lda_model_path = os.path.join(model_path, "lda.model")
        self._dictionary_path = os.path.join(model_path, "tokens.dict")
        self.num_topics = num_topics
        self.model_folder_lock = FileLock(model_path)

    def topics_for_documents(self, doc_tokens_map):
        self.check_and_load_model()
        doc_topics_map = defaultdict(list)
        for document_id, document_tokens in doc_tokens_map.items():
            doc_topics_map[document_id] = self.topics_for_document(
                document_tokens)
        return doc_topics_map

    def topics_for_document(self, tokens):
        self.check_and_load_model()
        bow_tokens = self._dictionary.doc2bow(tokens)
        topics = self._lda_model[bow_tokens]
        return topics

    def build_topics(self, tokens_list):
        self._dictionary = Dictionary(tokens_list)
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model = LdaModel(corpus=corpus,
                                   id2word=self._dictionary,
                                   num_topics=self.num_topics,
                                   passes=100)
        self.save_model()

    def save_model(self, sleep_for_test=False, mock_datastruct=None):
        self.save_model_lock.acquire()
        self.model_folder_lock.acquire()
        if mock_datastruct: mock_datastruct.acquire()
        if sleep_for_test:
            import time
            time.sleep(1)
        print("Acquired Lock")
        try:
            self._lda_model.save(self._lda_model_path)
            self._dictionary.save(self._dictionary_path)
        finally:
            print("Released Lock")
            if mock_datastruct: mock_datastruct.release()
            self.model_folder_lock.release()
            self.save_model_lock.release()

    def check_and_load_model(self):
        if self._lda_model and self._dictionary:
            return
        if os.path.exists(self._lda_model_path):
            self._lda_model = LdaModel.load(self._lda_model_path)
        else:
            raise Exception("LDA Model Not found in the path")
        if os.path.exists(self._dictionary_path):
            self._dictionary = Dictionary.load(self._dictionary_path)
        else:
            raise Exception("Tokens Dictionary Not found in the path")

    def update_model(self, tokens_list):
        self.check_and_load_model()
        corpus = [
            self._dictionary.doc2bow(document_tokens)
            for document_tokens in tokens_list
        ]
        self._lda_model.update(corpus=corpus)
        self.save_model()

    def build_or_update_model(self, tokens_list):
        if not self.does_model_exist():
            self.build_topics(tokens_list)
        else:
            self.update_model(tokens_list)

    def does_model_exist(self):
        if os.path.exists(self._lda_model_path) and os.path.exists(
                self._dictionary_path):
            return True
        return False

    def get_model(self):
        self.check_and_load_model()
        model_hash = {
            "lda_model": cPickle.dumps(self._lda_model),
            "dictionary": cPickle.dumps(self._dictionary)
        }
        return model_hash

    def restore_model(self, model_hash):
        self._lda_model = cPickle.loads(
            model_hash["lda_model"].encode('utf-8'))
        self._dictionary = cPickle.loads(
            model_hash["dictionary"].encode('utf-8'))
        self.save_model()

    def topics_to_tokens(self):
        topics_tokens_map = defaultdict(list)
        if not self.does_model_exist():
            return []
        else:
            model = self._lda_model
            topics_to_tokens = model.show_topics(
                topics=self.DEFAULT_NUM_TOPICS,
                topn=25,
                log=False,
                formatted=False)

            for topic_id, tokens in enumerate(topics_to_tokens):
                topics_tokens_map[topic_id] = self.list_of_tuples_to_hash(
                    tokens)

            return topics_tokens_map

    def list_of_tuples_to_hash(self, tokens):
        tokens_hash = defaultdict(float)
        for token_probability, token in tokens:
            tokens_hash[token] = token_probability
        return tokens_hash
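A hypothetical way the tagger above might be driven (the model directory name and sample tokens are made up for illustration):

tagger = LDATagger(model_path="./lda_model_dir", num_topics=50)
tokens_list = [["graph", "minors", "trees"], ["human", "interface", "computer"]]
tagger.build_or_update_model(tokens_list)   # trains on the first call, updates afterwards
topics = tagger.topics_for_document(["graph", "trees"])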
Example #10
class LdaTest(unittest.TestCase):

    def setUp(self):
        self.lda = LdaModel(corpus=common_corpus, id2word=common_dictionary, num_topics=10)

    def test_common_dictionary(self):
        """
        Test if dictionary of our model is equal to common_dictionary
        :return:
        :rtype:
        """
        dictionary = {'computer': 0, 'human': 1, 'interface': 2, 'response': 3, 'survey': 4, 'system': 5, 'time': 6,
                      'user'    : 7, 'eps': 8, 'trees': 9, 'graph': 10, 'minors': 11}
        self.assertEqual(self.lda.id2word.token2id, dictionary)

    def test_common_texts(self):
        """
        Test if the order of common_texts has changed.
        :return:
        :rtype:
        """
        texts = [['human', 'interface', 'computer'], ['survey', 'user', 'computer', 'system', 'response', 'time'],
                 ['eps', 'user', 'interface', 'system'], ['system', 'human', 'system', 'eps'],
                 ['user', 'response', 'time'],
                 ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']]
        self.assertEqual(common_texts, texts)

    def test_common_corpus(self):
        """
        Test if the order of common_corpus has changed.
        :return:
        :rtype:
        """
        corpus = [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
                  [(2, 1), (5, 1), (7, 1), (8, 1)],
                  [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)],
                  [(9, 1), (10, 1), (11, 1)],
                  [(4, 1), (10, 1), (11, 1)]]
        self.assertEqual(corpus, common_corpus)

    def test_new_corpus(self):
        """
        Test if the bow representation of new corpus is consistent
        :return:
        :rtype:
        """
        other_texts_without_unseen_word = [["computer", "time", "graph"], ["survey", "response", "eps"], ["human", "system", "computer"]]
        other_corpus_without_unseen_word = [common_dictionary.doc2bow(text) for text in other_texts_without_unseen_word]
        self.assertEqual(other_corpus_without_unseen_word[0], [(0, 1), (6, 1), (10, 1)])
        other_texts_with_unseen_word = [["computer", "graph", "hardware", "time", ], ["survey", "response", "eps", "administrator"]]
        other_corpus_with_unseen_word = [common_dictionary.doc2bow(text) for text in other_texts_with_unseen_word]
        self.assertEqual(other_corpus_with_unseen_word[0], [(0, 1), (6, 1), (10, 1)])

    def test_lda_update_1(self):
        """
        Update with unseen text, which doesn't have new words.
        :return:
        :rtype:
        """
        other_texts = [["computer", "time", "graph"], ["survey", "response", "eps"], ["human", "system", "computer"]]
        other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
        original_model = copy.deepcopy(self.lda)
        # Inplace update
        self.lda.update(other_corpus)
        self.assertNotEqual(self.lda, original_model)
        self.assertEqual(self.lda.id2word.token2id, original_model.id2word.token2id)

    def test_lda_update_2(self):
        """
        Update with unseen text, which has new words.
        'hardware' and 'administrator' are added for the test.
        :return:
        :rtype:
        """
        other_texts = [["computer", "graph", "hardware", "time", ], ["survey", "response", "eps", "administrator"]]
        other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]
        original_model = copy.deepcopy(self.lda)
        self.lda.update(other_corpus)
        self.assertEqual(self.lda.id2word.token2id, original_model.id2word.token2id)
Example #11
class LDAModel:
    """
    Base class for LDA model.
    """
    def __init__(self, vector_length):
        """
        Initialize the model with its parameters. The model is trained later, in train(), if it has not been fit before.

        :param vector_length: Number of topics in model.
        """

        self.shortname = 'LDA'
        self.name = 'LDAmodel' + str(vector_length)
        self.vector_length = vector_length
        self.remove_stopwords = None
        self.word_dict = None
        self.path = None
        self.model = None
        self.doc_vecs = None

    def set_dict(self,
                 data,
                 remove_stopwords=False,
                 no_below=1,
                 no_above=1,
                 filter_most_frequent=0):
        """
        Set/make dictionary to be used for bow representations.

        :param data: Which data to use for making dictionary.
        :param remove_stopwords: Whether to remove stopwords.
        :param no_below: Minimum number of documents a word has to appear in to be included.
        :param no_above: Maximum fraction of documents a word can appear in to be included.
        :param filter_most_frequent: Remove the most frequent words.
        """

        if self.word_dict is not None:
            print(
                "Model already has a dictionary! This function call does nothing."
            )
            return

        self.name = '%s_%sdict_rs%s_nb%s_na%s_fmf%s' % (
            self.name, data.name, str(remove_stopwords), str(no_below),
            str(no_above), str(filter_most_frequent))

        self.remove_stopwords = remove_stopwords
        self.word_dict = data.get_dictionary(remove_stopwords, no_below,
                                             no_above, filter_most_frequent)

    def train(self, data, passes):
        """
        Fit the LDA model to the data, set document topic vectors and calculate distances.

        :param data: Data to fit model on
        """

        if self.word_dict is None:
            print(
                "Dictionary must be assigned to the model before training. This function call does nothing."
            )
            return
        if self.model is None:
            self.model = LdaModel(num_topics=self.vector_length,
                                  id2word=self.word_dict,
                                  alpha='auto')  #, eta='auto')

        self.name = '%s_%strain_p%s' % (self.name, data.name, str(passes))
        self.path = Path('modelfiles/%s/%s' % (data.name, self.name))

        try:
            self.model = LdaModel.load(str(self.path / '.model'))
        except:
            self.path.mkdir(parents=True, exist_ok=True)

            print("Training model...", end='')
            time.sleep(0.1)

            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            self.model.update(datastream,
                              passes=passes,
                              chunksize=20000,
                              iterations=500)

            self.model.save(str(self.path / '.model'))

    def fit(self, data):
        """
            Fit the LDA model to the data, set document topic vectors and calculate distances.
        """

        if self.model is None:
            print(
                "Model must be trained first. This function call does nothing")
            return

        try:
            self.doc_vecs = pd.read_csv(
                self.path / str('document_vectors_%s.csv' % data.name),
                index_col=0)
        except:

            print("Fitting model...", end='')
            time.sleep(0.1)

            # Container for document topic vectors with zeros
            doc_vecs = np.zeros((len(data.ids), self.vector_length))

            # For each document
            datastream = GetBow(data, self.remove_stopwords, self.word_dict)
            for i in range(len(datastream)):

                # element is now a tuple with index and value for nonzero vector elements
                for element in self.model[datastream[
                        i]]:  #self.model.get_document_topics(datastream[i], minimum_probability=0.0):

                    # Set nonzero elements in container
                    doc_vecs[i][element[0]] = element[1]

            # Set document topic vectors as pandas dataframe
            self.doc_vecs = pd.DataFrame(doc_vecs, index=data.ids)
            self.doc_vecs.to_csv(self.path /
                                 str('document_vectors_%s.csv' % data.name))
Example #12
class TweetLDA:
    def __init__(self, date):
        self.date = date
        self.documents = []
        self.tweet_ids = []

        self.bigram = None
        self.b_min = 90

        self.dictionary = None
        self.corpus = None

        # Training parameters
        self.num_topics = 6
        self.chunksize = 60000
        self.passes = 20
        self.iterations = 400
        self.eval_every = None

        self.model = None

    def compute_bigram(self):
        '''
        Find and save bigrams living among the tweets

        :update: [covid_tweets].[token_tweets]
        '''
        print("Computing bigram.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        count_query = '''
            SELECT count(tweet_id)
            FROM token_tweets
            WHERE date = ?'''

        cursor.execute(count_query, (self.date, ))
        num_tweets = cursor.fetchone()[0]
        print(self.date, num_tweets, "to have bigram computed.")

        query = '''
            SELECT tweet_id, tokenized_tweet
            FROM token_tweets
            WHERE date = ?'''

        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()

        cnxn.close()

        retokenized_tweets = []
        for tweet_id, tokenized_tweet in results:
            tweet_tokens = tokenized_tweet.split(" ")
            retokenized_tweets.append(tweet_tokens)

        phrases = Phrases(retokenized_tweets, min_count=self.b_min)
        bigram = Phraser(phrases)

        bigram.save(f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")
        print("Bigram computed.")

    def load_bigram(self):
        '''
        Search for and load a pre-existing bigrams file

        :update: self.bigram
        '''
        self.bigram = Phraser.load(
            f"./tmp/{self.date}_bigram_model_{self.b_min}.pkl")

        print("Bigram loaded.")

    def prepare_documents(self):
        '''
        Integrate bigrams into the documents so they can be used for the model

        :update: self.documents
        '''
        print("Preparing documents.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tweet_id, tokenized_tweet
            FROM token_tweets
            WHERE date = ?
            AND in_model = 0
            LIMIT 50000'''

        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()

        cnxn.close()

        if len(results) == 0:
            raise ValueError

        for tweet_id, tt in results:
            self.documents.append(tt.split(" "))
            self.tweet_ids.append(tweet_id)

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        print("Documents have been prepared.")

    def update_documents(self):
        '''
        Flag that documents have been added to the LDA model

        :update: [token_tweets]
        '''
        print("Updated relevant documents in [token_tweets]")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        update_query = '''
            UPDATE token_tweets
            SET in_model = 1
            WHERE tweet_id IN (%s)'''

        cursor.execute(update_query % ','.join('?' * len(self.tweet_ids)),
                       self.tweet_ids)
        cnxn.commit()
        cnxn.close()

    def generate_dictionary(self):
        '''
        Create a dictionary representation of the documents, filtering extremes.

        :update: self.dictionary, gensim Dictionary object
        '''
        print("Generating dictionary.")
        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tokenized_tweet
            FROM token_tweets
            WHERE date = ?'''

        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()

        cnxn.close()

        self.documents = [tt.split(" ") for tt, in results]

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        self.dictionary = Dictionary(self.documents)
        self.dictionary.filter_extremes(no_below=30, no_above=0.50)

        self.dictionary.save(f"./tmp/{self.date}_dictionary.pkl")
        print("Dictionary has been saved.")

    def load_dictionary(self):
        '''
        Load a dictionary of the associated documents.

        :update: self.corpus, list of Bag-of-Word documents
        '''
        self.dictionary = Dictionary()
        self.dictionary = self.dictionary.load(
            f"./tmp/{self.date}_dictionary.pkl")

        print("Dictionary loaded.")

    def generate_corpus(self):
        '''
        Create a Bag-of-Words corpus, ready for training.

        :update: self.corpus, list of Bag-of-Word documents
        '''
        self.corpus = [self.dictionary.doc2bow(d) for d in self.documents]

    def generate_model(self):
        '''
        Utilizing the Gensim library and the prepared corpus, create a
        trained LDA model.

        :update: self.model, LdaModel object
        '''
        temp = self.dictionary[0]  # access one entry to force the dictionary to build id2token
        id2word = self.dictionary.id2token

        print("Model generation is beginning.")

        self.model = LdaModel(corpus=self.corpus,
                              id2word=id2word,
                              chunksize=self.chunksize,
                              alpha='auto',
                              eta='auto',
                              iterations=self.iterations,
                              num_topics=self.num_topics,
                              passes=self.passes,
                              eval_every=self.eval_every)

        print("Model generated.")

        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        self.model.save(f"./tmp/{self.date}_model")
        print("Model has been saved.")

        self.update_documents()

        pprint(self.model.top_topics(self.corpus))

    def load_model(self):
        '''
        Load a pre-trained model to be analyzed or updated

        :update: self.model, LdaModel object
        '''
        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        #self.model = LdaModel.load(temp_file)
        self.model = LdaModel.load(f"./tmp/{self.date}_model")

    def update_model(self):
        '''
        Update the pre-existing model with a new corpus

        :update: self.documents
        :update: self.model
        :update: self.corpus
        '''
        self.prepare_documents()
        self.generate_corpus()

        print(f"{self.date}_model is being updated.")
        self.model.update(self.corpus, chunksize=self.chunksize)

        temp_file = datapath(f"{self.date}_model")
        print(temp_file)
        self.model.save(temp_file)
        print("Model has been saved.")

        pprint(self.model.top_topics(self.corpus))

        self.update_documents()

    def analyze_model(self):
        '''
        Examine the top topics of the model.
        '''

        cnxn = sqlite3.connect("covid_tweets.db")
        cursor = cnxn.cursor()

        query = '''
            SELECT tokenized_tweet
            FROM token_tweets
            WHERE date = ?
            AND in_model = 1'''

        cursor.execute(query, (self.date, ))
        results = cursor.fetchall()

        cnxn.close()

        print(len(results), "documents are in the model.")

        for tt, in results:
            self.documents.append(tt.split(" "))

        for i in range(len(self.documents)):
            for token in self.bigram[self.documents[i]]:
                if '_' in token:
                    self.documents[i].append(token)

        self.generate_corpus()

        self.top_topics = self.model.top_topics(self.corpus)
        pprint(self.top_topics)
        self.save_top_topics()

    def output_topics_json(self, values):
        '''
        Output json from a list of tuples
        '''
        to_json = []
        for date, topic_num, word, probability in values:
            to_json.append({
                "date": date,
                "topic_num": topic_num,
                "word": word,
                "probability": str(probability)
            })

        with open(f"./tmp/{self.date}_topics.json", "w") as outfile:
            json.dump(to_json, outfile)

    def save_top_topics(self):
        '''
        Given the top topics, save them to a .json file
        '''
        to_save = []
        for i, topic in enumerate(self.top_topics, 1):
            for probability, word in topic[0]:
                to_save.append((self.date, i, word, probability))

        self.output_topics_json(to_save)
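The methods above suggest a per-day pipeline; a possible call order (the date string format is an assumption taken from the file-name f-strings):

# one-time setup for the day: bigrams and dictionary
tlda = TweetLDA("2020-04-01")
tlda.compute_bigram()
tlda.load_bigram()
tlda.generate_dictionary()

# training run on a fresh instance
tlda = TweetLDA("2020-04-01")
tlda.load_bigram()
tlda.load_dictionary()
tlda.prepare_documents()   # pull up to 50,000 not-yet-modeled tweets
tlda.generate_corpus()
tlda.generate_model()      # train, save, and flag documents as in_model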
Example #13
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

#Train the model on the corpus
lda = LdaModel(common_corpus, num_topics=10)
'''Breaking this down step by step: first, common_texts is a list, and each of its elements can be regarded as one document, itself also a list:'''
print(type(common_texts))
print(common_texts[0])
'''Step two: the doc2bow method converts text into its bag-of-words form; a look at an official example should make it clear:'''
from gensim.corpora import Dictionary
dct = Dictionary(["máma mele maso".split(), "ema má máma".split()])
print(dct.doc2bow(["this", "is", "máma"]))
print(dct.doc2bow(["this", "is", "máma"], return_missing=True))
'''At initialization an id is generated for every word; when a new text is passed in, the id and count of each of its words are returned, and whether to also return words missing from the original dictionary is configurable.
The corpus built this way is effectively the input to LDA training; let's check it:'''
print(common_corpus[0])
# the word "human" has id 0 and appears only once in the first document
'''The last step is simply to call the LDA model; here 10 topics are specified.'''
from gensim.models import LdaModel
lda = LdaModel(common_corpus, num_topics=10)
'''Let's inspect the result (there are many more methods, see the documentation), for example which words a topic is made of:'''
print(lda.print_topic(1, topn=2))
'''This shows the topic's word distribution; ids 9 and 10 carry the largest weights (topn controls how many words are printed, and the corresponding words can be looked up in the dictionary built earlier).
We can also update the lda model we just trained with a new corpus:'''
'''
# update all parameters
lda.update(other_corpus)
# the topic distribution can also be updated on its own; the inputs are the previous parameters, where rho is the learning rate
lda.update_alpha(gammat, rho)
# the word distribution can likewise be updated on its own
lda.update_eta(lambdat, rho)
'''