Example #1
def train_lda(args):
	print "[LDA > n_topics: %d ]" % args.dim	
	lda_reader = LDAReader(args.ds, max_sent=args.max_sent)		
	ldazito = LdaMulticore(lda_reader, id2word=lda_reader.idx2wrd,
									   num_topics=args.dim, 
									   workers=args.workers)
	ldazito.save(args.out)	
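
Example #1 depends on an `LDAReader` class from its own project, which is not shown here; the sketch below is a hypothetical stand-in that matches how it is used (it streams bag-of-words vectors and exposes an `idx2wrd` mapping).

from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess

class LDAReader:
    """Hypothetical stand-in for the project's LDAReader (interface guessed from usage)."""
    def __init__(self, path, max_sent=None):
        self.path = path
        self.max_sent = max_sent
        # first pass over the file: build the id <-> word mapping
        self.idx2wrd = Dictionary(self._tokenized_lines())

    def _tokenized_lines(self):
        with open(self.path, encoding="utf8") as f:
            for i, line in enumerate(f):
                if self.max_sent is not None and i >= self.max_sent:
                    break
                yield simple_preprocess(line)

    def __iter__(self):
        # second pass: stream each line as a bag of words for LdaMulticore
        for tokens in self._tokenized_lines():
            yield self.idx2wrd.doc2bow(tokens)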
Example #2
File: lda.py Project: aleju/ner-crf
def train_lda():
    """
    Train the LDA model.
    generate_dictionary() must be called before this method.
    """
    print("------------------")
    print("Training LDA model")
    print("------------------")

    # load dictionary, as generated by generate_dictionary()
    print("Loading dictionary...")
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)

    # generate a mapping from word id to word
    print("Generating id2word...")
    id2word = {}
    for word in dictionary.token2id:
        id2word[dictionary.token2id[word]] = word

    # initialize LDA
    print("Initializing LDA...")
    lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS, id2word=id2word,
                             workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE)

    # Train the LDA model
    print("Training...")
    examples = []
    update_every_n_windows = 25000
    windows = load_windows(load_articles(cfg.ARTICLES_FILEPATH), cfg.LDA_WINDOW_SIZE,
                           only_labeled_windows=True)
    for i, window in enumerate(windows):
        tokens_str = [token.word.lower() for token in window.tokens]
        bow = dictionary.doc2bow(tokens_str) # each window as bag of words
        examples.append(bow)
        if len(examples) >= update_every_n_windows:
            print("Updating (at window %d of max %d)..." % (i, COUNT_EXAMPLES_FOR_LDA))
            # this is where the LDA model is trained
            lda_model.update(examples)
            examples = []
        if i >= COUNT_EXAMPLES_FOR_LDA:
            print("Reached max of %d windows." % (COUNT_EXAMPLES_FOR_LDA,))
            break

    # I don't update here with the remainder of windows, because I'm not sure if each update step's
    # results are heavily influenced/skewed by the number of examples
    #if len(examples) > 0:
    #    print("Updating with remaining windows...")
    #    lda_model.update(examples)

    # save trained model to HDD
    print("Saving...")
    lda_model.save(cfg.LDA_MODEL_FILEPATH)
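
A side note on the id2word loop above: recent gensim versions accept the Dictionary object directly as id2word, so the manual inversion of token2id is optional. A hedged alternative:

# Alternative (assuming a recent gensim): pass the Dictionary itself as id2word
# instead of building the id -> word dict by hand.
lda_model = LdaMulticore(corpus=None, num_topics=cfg.LDA_COUNT_TOPICS,
                         id2word=dictionary,
                         workers=LDA_COUNT_WORKERS, chunksize=LDA_CHUNK_SIZE)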
Example #3
class LdaProcessor(object):
    def __init__(self, token_docs, **filter_extremes_args):
        """
        token_docs : a list of lists of word or n-gram or sentence tokens.
            Eg, [['the','crazy','cat'],['that','doggone','dog']]
        """
        self.token_docs = token_docs
        self.id2word = corpora.Dictionary(token_docs)
        if filter_extremes_args:
            print('filtering words with extreme frequencies')
            self.id2word.filter_extremes(**filter_extremes_args)
        # initialize the bow_corpus
        self.reset_bow_corpus(token_docs)

        print('Got %i total tokens (words)' % len(self.id2word))

    def reset_bow_corpus(self, documents):
        """set or reset the corpus with the given documents"""
        self.bow_corpus = [self.id2word.doc2bow(doc) for doc in documents]
        return None

    def train_lda(self, num_topics, **kwargs):
        print('training LDA...')
        self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word, num_topics=num_topics, **kwargs)
        return self

    def word_topics(self, num_words=10):
        return [topic[1] for topic in self.lda.print_topics(num_topics=self.lda.num_topics, num_words=num_words)]

    # utility functions
    def significant_topic_terms(self, topicid):
        raise NotImplementedError()
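
A hypothetical usage of the LdaProcessor above (documents and parameters are invented for illustration):

docs = [['the', 'crazy', 'cat'], ['that', 'doggone', 'dog'], ['a', 'quiet', 'cat']]
proc = LdaProcessor(docs, no_below=1, no_above=1.0)   # filter_extremes kwargs are illustrative
proc.train_lda(num_topics=2, passes=10, workers=2)
for topic_string in proc.word_topics(num_words=5):
    print(topic_string)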
Example #4
File: lda.py Project: aleju/ner-crf
def test_lda(sentence):
    """Tests the trained LDA model on an example sentence, i.e. returns the topics of that
    sentence.
    May only be called after train_lda().

    Args:
        sentence: A sentence to test on as string.
    """
    # validate and process the sentence
    if sentence is None or len(sentence) < 1:
        raise Exception("Missing or empty 'sentence' argument.")

    sentence = sentence.decode("utf-8").lower().strip().split(" ")
    if len(sentence) != cfg.LDA_WINDOW_SIZE:
        print("[INFO] the token size of your sentence does not match the defined window " \
              "size (%d vs %d)." % (len(sentence), cfg.LDA_WINDOW_SIZE))

    # load dictionary and trained model
    dictionary = gensim.corpora.dictionary.Dictionary.load(cfg.LDA_DICTIONARY_FILEPATH)
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # sentence to bag of words
    bow = dictionary.doc2bow(sentence)

    # print topics of sentence
    print(lda_model[bow])
Example #5
class Lda(BaseEstimator, TransformerMixin):
    def __init__(self, id2word=None, num_topics=25, passes=1):
        self.lda = None
        self.id2word = id2word
        self.num_topics = num_topics
        self.passes = passes

    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(
                    id2word=self.id2word, num_topics=self.num_topics, passes=self.passes)
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self

    def fit_transform(self, X, y=None):
        self.fit(X)
        return self.transform(X)

    def transform(self, X):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        topic_vectors : [np.ndarray]
            each matrix is of shape (sent_count, topic_count)
        """
        topic_vectors = []
        for doc in X:
            sents_bow = Sparse2Corpus(doc, documents_columns=False)
            gamma, _ = self.lda.inference(sents_bow)
            # divide row by row sum
            topic_dist = (gamma.T / np.sum(gamma, axis=1)).T
            topic_vectors.append(topic_dist)
        return topic_vectors
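
A hypothetical usage of the Lda transformer above; the vocabulary and the per-document sentence matrices are toy data made up for illustration:

import numpy as np
import scipy.sparse as sp

id2word = {0: 'cat', 1: 'dog', 2: 'fish'}
X = [sp.csr_matrix(np.array([[1, 0, 2], [0, 3, 1]])),  # document 1: two sentences
     sp.csr_matrix(np.array([[0, 1, 0]]))]             # document 2: one sentence
lda = Lda(id2word=id2word, num_topics=2, passes=2)
topic_vectors = lda.fit_transform(X)
print(topic_vectors[0].shape)  # (2, 2): sentences x topics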
Example #6
 def load(self):
     """
     Load previous saved ldaprocessor results
     """
     try:
         return LdaMulticore.load(self.lda_out_file_name)
     except Exception:
         return None
Example #7
    def perform(self, option="load"):
        """
        Perform LDA analysis to generate topics
        and topic distribution for each app
        """
        logging.info("Start Lda analysis")

        ldamodel = LdaMulticore(self.corpus, num_topics=self.ntopic, id2word=self.dictionary, passes=self.iteration)
        logging.info("LDA multicore modeling done")

        ldamodel.save(self.lda_out_file_name)

        self.topics = {}
        for i in range(0, self.ntopic, 1):
            self.topics["topic{}".format(i)] = ldamodel.show_topic(i, topn=self.nword)
            logging.info("Topic{}".format(i))
            words = [w[1] for w in self.topics["topic{}".format(i)]]
            logging.info(words)
Example #8
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    args = parse_args()

    dictionary = corpora.Dictionary.load(os.path.join(args.prefix, 'review.dict'))
    logging.info('Pruning dictionary')
    dictionary.filter_extremes(no_below=args.no_below,
                               no_above=args.no_above)

    corpus = ReviewCorpus(os.path.join(args.prefix, 'review.json'),
                          dictionary)

    logging.info('Computing LDA model')
    lda = LdaMulticore(corpus, num_topics=args.num_topics, id2word=dictionary,
                       workers=args.workers)

    logging.info('Persisting LDA model')
    lda.save(os.path.join(args.prefix, 'review.ldamodel'))
Example #9
 def build_model(self, fname=None, save_to=None):
     id2word = self.id2word or self.build_id2word()
     corpus = self.corpus or self.build_corpus()
     # read model.lda file
     if not fname:
         fname = click.prompt('model file name', type=str, default='model.lda')
     fname = self.__dest(fname)
     # if there is no model file or the user wants to rebuild, build .model
     if not os.path.isfile(fname) or click.confirm('There already is %s. Do you want to re-run LDA?' % fname):
         num_procs = click.prompt('Number of processes to launch',
                                  type=int,
                                  default=multiprocessing.cpu_count())
         num_epochs = click.prompt('Number of epochs to run', type=int, default=20)
         num_topics = click.prompt('Number of topics', type=int, default=100)
         print('start building model')
         start = time()
         model = LdaMulticore(corpus, id2word=id2word, num_topics=num_topics, workers=num_procs, passes=num_epochs)
         model.save(fname) #save
         print('building model takes: %s' % LdaUtils.human_readable_time(time() - start))
     self.model = LdaMulticore.load(fname)
     return self.model
Example #10
File: lda.py Project: aleju/ner-crf
 def __init__(self, lda_filepath, dictionary_filepath, cache_filepath=None):
     """Initialize the LDA wrapper.
     Args:
         lda_filepath: Filepath to the trained LDA model.
         dictionary_filepath: Filepath to the dictionary of the LDA.
         cache_filepath: Optional filepath to a shelve cache for the LDA results.
     """
     self.lda = LdaMulticore.load(lda_filepath)
     self.dictionary = gensim.corpora.dictionary.Dictionary.load(dictionary_filepath)
     self.cache_synch_prob = 2 # in percent, 1 to 100
     self.cache_filepath = cache_filepath
     self.cache = shelve.open(cache_filepath) if cache_filepath is not None else None
Example #11
def fit_numtopics(train_corpus, test_corpus, id2word, num_topics_list, iters, workers, chunksize, logfilename, save=True):

	"""
	Args: 
	num_topics_list = list of number of topics, a model will be fitted for each
	save: indicates whether model should be saved
	Returns: topics_dict = a dictionary of topics lists, where the key is the number of topics
	"""
	topics_dict = {}
	logfile = open(logfilename, 'w')
	for num_topics in num_topics_list:
		
		print('training', num_topics)
		np.random.seed(NUM)

		start_time = time.time()
		model = LdaMulticore(corpus=train_corpus, id2word=id2word,
							 num_topics=num_topics, iterations=iters,
							 eval_every=None, workers=workers,
							 chunksize=chunksize)
		end_time = time.time()

		if save:
			fname = 'data\\orig_' + str(num_topics) + 'topics.lda'
			model.save(fname)

		per_word_bound = model.log_perplexity(test_corpus)
		perplexity = np.exp2(-1.0 * per_word_bound)

		logfile.write('\n' + 'num_topics: ' + str(num_topics) + '\n')
		logfile.write('perplexity: ' + str(perplexity) + '\n')
		logfile.write('train_time: ' + str(end_time - start_time) + '\n' + 'Topics: \n')

		topics = model.show_topics(num_topics=num_topics, num_words=20)
		topics_dict[str(num_topics)] = topics
		for topic in topics:
			logfile.write('\n\t' + topic.encode('ascii', 'ignore')  + '\n')

	logfile.close()		
	return topics_dict
Example #12
File: lda.py Project: Vivarta/geiger
    def train(self, comments):
        """
        Build the topic model from a list of documents (strings).

        Assumes documents have been pre-processed (e.g. stripped of HTML, etc)
        """
        docs = [c.body for c in comments]
        vecs = self.vectr.vectorize(docs, train=True)
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

        if self.verbose:
            self.print_topics()
Example #13
    def build_lda_model(self, topics: int=20):
        ignore_words = [
            'like', 'know', 'f**k', 'f*****g', 'want', 'shit', 'know', 'sure',
            'isn', 'CHANBOARD', 'think', 'people', 'good', 'time', 'going',
            'WEBLINK', 'got', 'way', ''
        ]
        filename = op.join(self.input_dir, f'{self.board}.dictionary')
        dictionary: Dictionary = Dictionary.load(filename)
        documents = ReadThreads(
            self.board, input_dir=self.input_dir, file_type='phrases',
            return_func=lambda x, y: dictionary.doc2bow(
                [w for w in y.split() if w not in ignore_words]
            )
        )

        lda = LdaMulticore(
            documents, id2word=dictionary, num_topics=topics, iterations=2)

        filename = op.join(self.input_dir, f'{self.board}.lda')
        lda.save(filename)

        return lda
Example #14
File: lda.py Project: aleju/ner-crf
def show_topics():
    """Shows all topics of the trained LDA model.
    May only be called after train_lda().
    """
    # load trained model
    lda_model = LdaMulticore.load(cfg.LDA_MODEL_FILEPATH)

    # list the topics
    topics = lda_model.show_topics(num_topics=cfg.LDA_COUNT_TOPICS, num_words=10, log=False,
                                   formatted=True)

    print("List of topics:")
    for i, topic in enumerate(topics):
        # not adding topic to the tuple here prevents unicode errors
        print("%3d:" % (i,), topic)
Example #15
    def fit(self, X, y=None):
        """
        Parameter
        ---------
        X : [sp.csr_matrix]

        Returns
        -------
        self
        """
        if self.lda is None:
            self.lda = LdaMulticore(
                    id2word=self.id2word, num_topics=self.num_topics, passes=self.passes)
        X_flat = sp.vstack(X)
        self.lda.update(Sparse2Corpus(X_flat, documents_columns=False))
        return self
Example #16
def main():
    options = {
        'corpus_file': 'data\\origtweets_dtm.pkl',
        'id_file': 'data\\row_origtweets.csv',
        'model_file': 'data\\orig_10topics.lda',
        'meta_file': 'data\\origtweets_meta.csv',
        'output_file': 'data\\origtweets_topics.csv'
    }

    start_time = time.time()
    id_df = pd.read_csv(options['id_file'], usecols=['row'], dtype='float')
    meta_df = pd.read_csv(options['meta_file'])

    with open(options['corpus_file'], 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    lda = LdaMulticore.load(options['model_file'])

    if len(meta_df) != len(corpus):
        print ('Warning: Some documents may have been deleted during processing.\n')
        print ('metadata size - corpus size = ' + str(len(meta_df) - len(corpus)))

    topic_features = [to_dense(lda[bow], lda.num_topics) for bow in corpus]

    topic_colname = 'topic{0}'.format
    topic_colnames = [topic_colname(t+1) for t in range(lda.num_topics)]
    topic_df = pd.DataFrame.from_records(topic_features, columns=topic_colnames)
    with open('data\\topic_df.pkl', 'wb') as pkl_file:
        pickle.dump(topic_df, pkl_file)


    print ('topic size - id size = ' + str(len(id_df) - len(topic_df)))
    if len(id_df) != len(topic_df):
       raise Exception()

    topic_df = pd.concat([id_df, topic_df], axis=1)
    
    merged_df = pd.merge(meta_df, topic_df, on='row', how='right', sort=False)
    merged_df.to_csv(options['output_file'], index=False)

    end_time = time.time()
    print ('running time: ' + str((end_time - start_time)/60) + ' minutes')
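
The `to_dense` helper used above is not shown in this snippet; presumably it expands gensim's sparse (topic_id, probability) pairs into a fixed-length vector, along the lines of the sketch below. gensim's `matutils.sparse2full` does the same job.

# Hypothetical reconstruction of the to_dense helper used above.
def to_dense(topic_pairs, num_topics):
    dense = [0.0] * num_topics
    for topic_id, prob in topic_pairs:
        dense[topic_id] = prob
    return dense

# Equivalent built-in:
# from gensim.matutils import sparse2full
# dense = sparse2full(lda[bow], lda.num_topics)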
Example #17
File: lda.py Project: frnsys/geiger
class LDA(Pipe):
    """
    LDA (Latent Dirichlet Allocation) model
    for unsupervised topic modeling.

    Takes vectors and returns topic vectors,
    which can be used for clustering.
    """
    input = Pipe.type.vecs
    output = Pipe.type.vecs

    def __init__(self, n_topics=5):
        self.n_topics = n_topics
        self.trained = False

    def __call__(self, vecs):
        """
        Return topic vectors.
        """
        if not self.trained:
            self.train(vecs)
            self.trained = True

        distribs = []
        for distrib in self.m[Scipy2Corpus(vecs)]:
            distribs.append([t[1] for t in distrib])
        distribs = np.array(distribs)
        return distribs

    def train(self, vecs):
        """
        Build the topic model.
        """
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

    def print_topics(self, vectorizer):
        vocab = vectorizer.vocabulary
        for topic in self.m.show_topics(num_topics=self.n_topics, num_words=10, formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
Example #18
def get_topics():
    '''Computes distribution over topics for each abstract'''

    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore.load('lda.gensim')

    base = 'datasets/dspace'
    new_base = 'datasets/dspace_topics'
    for filename in tqdm(os.listdir(base)):
        path = os.path.join(base, filename)
        with open(path, 'r') as f:
            d = json.load(f)
            abstract = d['abstract']
            if abstract is not None:
                words = tokenize(abstract.split())
                bow = dictionary.doc2bow(words)
                topics = lda.get_document_topics(bow, minimum_probability=0)
                topics = to_vec(topics)
                d['topics'] = topics
                new_path = os.path.join(new_base, filename)
                with open(new_path, 'w') as new_f:
                    json.dump(d, new_f)
#==============================================================================
# # Train LDA model: takes 1655 seconds to train the model
#==============================================================================
# No need to run LDA every time; the model has been stored
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()


# single LDA
topic_number = 3
start_time = time.time()
model = LdaMulticore(
                    matutils.Sparse2Corpus(X,documents_columns=False), 
                    num_topics=topic_number,passes=10,
                    chunksize=5000,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                    workers=7,
                    )
print("--- %s seconds ---" % (time.time() - start_time))
fname = folder_name+'LDA'+str(topic_number)+'topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#perplexity
perplexity = model.log_perplexity(matutils.Sparse2Corpus(X,documents_columns=False), total_docs=None)

Example #20
def learn(corpus):
    dictionary = Dictionary.load('lda.dict')
    lda = LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=NUM_TOPICS, chunksize=10000, passes=5)
    for line in lda.print_topics(NUM_TOPICS):
        print(line)
    lda.save('lda.gensim')
Example #21
    def generate_embedings(self,
                           method="tf-idf",
                           tag=None,
                           tag_column=None,
                           return_model=False):
        # Collect the embedding data and save it to a file for the multiprocess step
        if tag != None and tag_column != None:
            if (tag_column not in self.df.columns):
                raise ValueError(f"Tag {tag_column} not found in dataset")
            elif tag not in self.df[tag_column].to_list():
                raise ValueError(
                    f"Tag {tag} not found in dataset column {tag_column}")
            texts = self.df[self.df[tag_column] == tag][self.text_column]
        else:
            texts = self.df[self.text_column]

        with open('storage/texts.txt', 'w', encoding='utf8') as file:
            for sentence in texts:
                file.write(" ".join([tok for tok in sentence]) + "\n")

        # Check whether the user made a mistake when passing the tags
        if tag != None and tag_column == None:
            raise ValueError("if passing tag must pass tag_column as well")

        if tag_column != None and tag == None:
            raise ValueError("if passing tag_column must pass tag as well")

        # Check whether the vector was already generated and the target is the whole corpus
        if method in self.embedings and tag == None:
            if return_model:
                return self.embedings[method]
            else:
                return self.embedings[method][0]

        # Compute TF-IDF
        if method == "tf-idf":
            model = TfidfVectorizer(min_df=5,
                                    max_df=0.9,
                                    max_features=5000,
                                    sublinear_tf=False,
                                    analyzer=lambda x: x)

            vectors = model.fit_transform(texts)

        # Train Word2Vec
        elif method == "word2vec" or method == "cbow":
            model = gensim.models.Word2Vec(corpus_file='storage/texts.txt',
                                           window=5,
                                           size=200,
                                           min_count=5,
                                           iter=100,
                                           workers=4)

            vectors = model.wv
            if tag == None:
                self.embedings["word2vec"] = vectors

            # Compute the CBOW averages
            if method == "cbow":
                vectors = []
                for text in texts:
                    vec = np.zeros(model.wv.vector_size)
                    for word in text:
                        if word in model.wv.vocab:
                            vec += model.wv.get_vector(word)

                    norm = np.linalg.norm(vec)
                    if norm > np.finfo(float).eps:
                        vec /= norm
                    vectors.append(vec)

                vectors = scipy.sparse.csr.csr_matrix(vectors)

        # Train Doc2Vec
        elif method == "doc2vec":

            model = gensim.models.Doc2Vec(corpus_file='storage/texts.txt',
                                          vector_size=200,
                                          window=5,
                                          min_count=5,
                                          workers=12,
                                          epochs=100)

            vectors = scipy.sparse.csr.csr_matrix(model.docvecs.vectors_docs)

        # Train LDA
        elif "lda" in method:
            if "_" in method:
                NUM_TOPICS = int(method.split("_")[-1])
            else:
                NUM_TOPICS = 20

            dictionary = Dictionary(texts)
            doc2bow = [dictionary.doc2bow(text) for text in texts]
            ldamodel = LdaMulticore(doc2bow,
                                    num_topics=NUM_TOPICS,
                                    id2word=dictionary,
                                    passes=30)

            raw_vecs = [ldamodel.get_document_topics(text) for text in doc2bow]

            lda_vecs = []
            for vec in raw_vecs:
                this_vec = []
                curr = 0
                for i in range(ldamodel.num_topics):
                    if (i == vec[curr][0]):
                        this_vec.append(vec[curr][1])
                        curr += 1
                        if curr == len(vec):
                            curr = -1
                    else:
                        this_vec.append(0)
                lda_vecs.append(this_vec)

            vectors = scipy.sparse.csr.csr_matrix(lda_vecs)
            model = [ldamodel, doc2bow, dictionary]

        else:
            raise ValueError(f"Method {method} is not recognized")

        # If not building a tag-specific version, cache the results
        if tag == None and not self.low_memory:
            self.embedings[method] = (vectors, model)

        if return_model:
            return vectors, model
        else:
            return vectors
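
The nested loop in the LDA branch above densifies the `get_document_topics` output by hand; gensim's `matutils.sparse2full` should give an equivalent result with less code. A sketch, assuming the same variables as in the LDA branch above:

# Sketch of an equivalent, simpler densification for the LDA branch above.
from gensim.matutils import sparse2full
import scipy.sparse

lda_vecs = [sparse2full(ldamodel.get_document_topics(bow), ldamodel.num_topics)
            for bow in doc2bow]
vectors = scipy.sparse.csr_matrix(lda_vecs)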
Example #22
# with open(io_file) as t:
#     print(t.read())

# io_file.seek(0)
# print(io_file.read())

# print(os.path.getsize(io_file))

# io_file.close()

# test
import uuid

model_guid = uuid.uuid4()
trained_model = LdaMulticore.load(model_dir)

# save_model(engine, 'model', 'repository',trained_model, model_guid, 'LDA_model', 'AKA - Requests Topic finder')

lda_model = get_model(engine, 'model', 'repository',
                      '55644D1D-D187-4347-8DCD-C94A67F5D5A5')
print(lda_model)

print('COMPLETE')


def test():
    import pickle
    # from sqlalchemy.dialects.mssql import BINARY

    ## Create a semi-complex list to pickle
Example #23
File: lda.py Project: Vivarta/geiger
class Model():
    """
    LDA (Latent Dirichlet Allocation) model
    for unsupervised topic modeling.

    TO DO:
        - this model has to be rebuilt for each comment section as new comments come in - what's the best way to manage that?

    Notes:
        - tried LDA on individual sentences, doesn't work as well.
    """

    def __init__(self, n_topics=5, verbose=False):
        self.verbose = verbose
        self.n_topics = n_topics
        self.vectr = Vectorizer()

    def train(self, comments):
        """
        Build the topic model from a list of documents (strings).

        Assumes documents have been pre-processed (e.g. stripped of HTML, etc)
        """
        docs = [c.body for c in comments]
        vecs = self.vectr.vectorize(docs, train=True)
        corp = Scipy2Corpus(vecs)
        self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)

        if self.verbose:
            self.print_topics()

    def featurize(self, docs):
        """
        Return topic vectors for documents.
        """
        vecs = self.vectr.vectorize(docs)

        dists = []
        for dist in self.m[Scipy2Corpus(vecs)]:
            dists.append([t[1] for t in dist])
        dists = np.array(dists)
        return dists

    def cluster(self, comments):
        """
        Build clusters out of most likely topics.
        """

        # If no model exists, train it.
        if not hasattr(self, 'm'):
            self.train(comments)

        clusters = [[] for _ in range(self.n_topics)]
        dists = self.featurize([c.body for c in comments])
        for i, comment in enumerate(comments):
            topic = dists[i].argmax()
            clusters[topic].append(comment)

        return clusters

    def identify(self, docs):
        """
        Labels a list of documents with
        their topic and probability for that topic.
        """
        vecs = self.vectr.vectorize(docs)
        dists = self.featurize(docs)
        for i, doc in enumerate(docs):
            topic = dists[i].argmax()
            proba = dists[i][topic]
            yield doc, topic, proba

    def print_topics(self):
        vocab = self.vectr.vocabulary
        for topic in self.m.show_topics(num_topics=self.n_topics, num_words=10, formatted=False):
            print([vocab[int(ix)] for prob, ix in topic])
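
The `Vectorizer` used by this Model comes from the geiger project and is not shown; below is a rough stand-in whose interface is guessed from the usage above (`vectorize(docs, train=...)` returning a scipy sparse matrix, plus a `vocabulary` lookup). It is not the project's actual class.

from sklearn.feature_extraction.text import TfidfVectorizer

class Vectorizer:
    """Rough stand-in for geiger's Vectorizer (interface guessed; assumes scikit-learn >= 1.0)."""
    def __init__(self):
        self._tfidf = TfidfVectorizer(stop_words='english')
        self.vocabulary = None

    def vectorize(self, docs, train=False):
        if train:
            vecs = self._tfidf.fit_transform(docs)
        else:
            vecs = self._tfidf.transform(docs)
        # column index -> term, as used by print_topics() above
        self.vocabulary = self._tfidf.get_feature_names_out()
        return vecs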
Example #24
def train_lda(dataset_path: str = None,
              print_stats: bool = True,
              save_pp_file: bool = True,
              pp_file_path: str = None,
              run_preprocess: bool = True):
    """
    Train search engine
    """
    if dataset_path is None:
        dataset_path = DATA_PATH

    if pp_file_path is None:
        pp_file_path = PP_DATA_PATH

    if not os.path.isfile(pp_file_path):
        run_preprocess = True

    df = pd.read_csv(dataset_path)
    df.rename({"class": "class_", "name": "sentencia"}, axis=1, inplace=True)

    df = df[[isinstance(x, str) for x in df.text.values]]

    # ## **Create the preprocessed corpus for FastText**
    # #### **Text preprocessing (for TF-IDF)**
    if run_preprocess:
        logging.info("Preprocesando...")
        with Pool(POOL_SIZE) as my_pool:
            pp_list = my_pool.map(preprocessor_sentences, df.text.values)
        df["pp"] = pp_list
        if save_pp_file:
            df.to_csv(pp_file_path)

    # df["class_"] = pd.Categorical(df["class_"])
    # df.to_csv("data/cc_dump_full_pp_20210508.csv")
    # df = pd.read_csv("data/cc_dump_full_pp_20210508.csv")
    df[df.text.isnull()]
    df[df.pp == ""]
    df = df[df.text.notnull()].copy()
    df[~np.array([isinstance(x, str) for x in df.pp.values])]
    df = df[[isinstance(x, str) for x in df.pp.values]].copy()

    doc_list = df.sentencia.unique().tolist()
    doc_sample = random.sample(doc_list,
                               k=int(len(doc_list) * LDA_SAMPLE_SIZE))
    df_sample = df[df.sentencia.isin(doc_sample)].copy()
    logging.info(f"Tamaño de muestra: {len(df_sample)} elementos (párrafos)")

    # Release the memory!
    df = None
    # df_sample["lemma_stop"] = df_sample["pp"]
    df_sample = df_sample[[isinstance(a, str) for a in df_sample.pp.values]]
    df_sample["pp"] = [
        doc.replace("\n", " ").strip() for doc in df_sample.pp.values
    ]
    # create tmp corpus file for yielding
    df_sample.pp.to_csv(LDA_CORPUS_PATH, index=False, header=False)

    # Build dict
    cc_dict = corpora.Dictionary(line.lower().strip().split()
                                 for line in open(LDA_CORPUS_PATH))
    logging.info("Dictionary initial size: {}".format(len(cc_dict)))

    # Filter common words in paragraphs and docs
    # paragraph level
    cc_dict.filter_extremes(no_above=0.5, keep_n=200000)
    cc_dict.compactify(
    )  # remove gaps in id sequence after words that were removed

    # document-level
    doc_dict = {
        doc: ' '.join(df_sample[df_sample.sentencia == doc]["pp"].values)
        for doc in df_sample.sentencia.unique()
    }
    doc_word_dict = {
        d: list(set(doc_dict[d].strip().split()))
        for d in doc_dict
    }
    doc_df = pd.DataFrame([{
        "name": doc,
        "words": ' '.join(doc_word_dict[doc])
    } for doc in doc_word_dict])
    cv = CountVectorizer(max_df=0.5)
    cv.fit(doc_df.words)
    logging.info(f"Document-level stopwords: {len(cv.stop_words_)}")
    stop_ids = [cc_dict.token2id[w] for w in cv.stop_words_]
    cc_dict.filter_tokens(bad_ids=stop_ids)
    cc_dict.compactify()

    # short words
    word_set = list(
        set([w for doc in doc_word_dict for w in doc_word_dict[doc]]))
    word_len = [(w, len(w)) for w in word_set]
    short_words = [word for word, length in word_len if length < 3]
    logging.info(f"Short words to remove: {len(short_words)}")
    short_words = [w for w in short_words if w in cc_dict.token2id.keys()]
    short_ids = [cc_dict.token2id[w] for w in short_words]
    cc_dict.filter_tokens(bad_ids=short_ids)
    cc_dict.compactify()

    # Train
    light_corpus = CorpusIterator(LDA_CORPUS_PATH, cc_dict)
    tfidf = models.TfidfModel(light_corpus)
    corpus_tfidf = TfidfCorpus(light_corpus, tfidf)

    # Train the model on the corpus.
    lda = LdaMulticore(corpus_tfidf,
                       id2word=cc_dict,
                       num_topics=LDA_NUM_TOPICS,
                       workers=LDA_WORKERS,
                       per_word_topics=True,
                       random_state=RANDOM_STATE,
                       passes=1)

    # temp_file = datapath("model")
    cc_dict.save_as_text(LDA_DICT_PATH)
    tfidf.save(LDA_TFIDF_PATH)
    lda.save(LDA_MODEL_PATH)
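
`CorpusIterator` and `TfidfCorpus` are project helpers that are not shown in this snippet; below are minimal hypothetical sketches consistent with how they are used above.

class CorpusIterator:
    """Hypothetical: streams each line of the corpus file as a bag of words."""
    def __init__(self, path, dictionary):
        self.path = path
        self.dictionary = dictionary

    def __iter__(self):
        with open(self.path, encoding="utf8") as f:
            for line in f:
                yield self.dictionary.doc2bow(line.lower().strip().split())


class TfidfCorpus:
    """Hypothetical: streams the TF-IDF-weighted version of another corpus."""
    def __init__(self, corpus, tfidf_model):
        self.corpus = corpus
        self.tfidf_model = tfidf_model

    def __iter__(self):
        for bow in self.corpus:
            yield self.tfidf_model[bow]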
Example #25
 def train_lda(self, num_topics, **kwargs):
     print('training LDA...')
     self.lda = LdaMulticore(self.bow_corpus, id2word=self.id2word, num_topics=num_topics, **kwargs)
     return self
Example #26
# load cleaned corpus
with open('data/cleaned_corpus_broad.pkl', 'rb') as f:
    corpus = pkl.load(f)
with open("data/id2word_broad.pkl", 'rb') as f:
    id2word = pkl.load(f)

# Choose the number of topics
nTopics = 40

# Train the LDA model with a prespecified number of topics
lda_model = LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=nTopics,
    random_state=100,
    chunksize=200,
    passes=5000,
    #                                            iterations=10000,
    #                                            minimum_probability=0,
    per_word_topics=True)

# Save the trained LDA model
lda_model.save(f"trained_models/trained_lda_model_new_{lda_model.num_topics}")

# Run the model
doc_lda = lda_model[corpus]

# Extract the topic distributions for each paper as numpy array
hm = np.zeros([len(corpus), lda_model.num_topics])
for i in range(len(doc_lda)):
    for topic_pair in doc_lda[i][0]:
Example #27
if __name__ == "__main__":

    #
    # GENSIM TOPIC APPROACH
    #

    dictionary = Dictionary(token_stream(NOVELS_DIRPATH))
    dictionary.filter_extremes(no_below=10, no_above=0.66) # excludes terms like "the", "to", "and", "of", "i", etc.
    print("-------------")
    print("TOKENS", len(dictionary.token2id), list(dictionary.token2id.items())[0:4], "...")

    bags_of_words = [dictionary.doc2bow(tokens) for tokens in token_stream(NOVELS_DIRPATH)]
    print("-------------")
    print("BAGS OF WORDS (CORPUS)", len(bags_of_words), bags_of_words[0])

    lda = LdaMulticore(corpus=bags_of_words, id2word=dictionary, random_state=723812, num_topics=15, passes=10, workers=4)
    print("-------------")
    print("LDA MODEL", type(lda))

    results = lda.print_topics()
    print("-------------")
    print("TOPICS (RAW RESULTS)...")
    print(results)

    parsed_topics = parse_topics(lda)
    print("-------------")
    print("TOPICS (PARSED RESULTS)...")
    pprint(parsed_topics)

    # h/t: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#11createthedictionaryandcorpusneededfortopicmodeling
    topics = lda[bags_of_words]
Example #28
                if MODE == 'distributed':
                    lda = LdaModel(corpus=corpus,
                                   num_topics=i,
                                   id2word=dictionary,
                                   distributed=False,
                                   update_every=1,
                                   chunksize=args.chunksize,
                                   passes=1,
                                   iterations=args.iterations,
                                   random_state=args.seed,
                                   eval_every=None)
                elif MODE == 'multicore':
                    lda = LdaMulticore(corpus=corpus,
                                       num_topics=i,
                                       id2word=dictionary,
                                       chunksize=args.chunksize,
                                       workers=3,
                                       passes=1,
                                       iterations=args.iterations,
                                       random_state=args.seed)

                lda_lst.append(lda)

                # Model Evaluation
                cm = CoherenceModel(model=lda,
                                    texts=texts,
                                    corpus=corpus,
                                    dictionary=dictionary,
                                    coherence='c_v')
                coherence = cm.get_coherence()
                coherence_lst.append(coherence)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tfidf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    #tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    mm = tfidf[wiki]
    MmCorpus.serialize(outp + '_tfidf.mm', mm, progress_cnt=10000)

logger.info("finished pre-processing, starting LDA %s", program)

lda = LdaMulticore(mm, id2word=dictionary, workers=10, num_topics=ntopics)
lda.save(model_name)
topics = lda.show_topics(num_topics=ntopics, num_words=30)
print(topics)
logger.info("finished LDA %s", program)

toptopics = lda.top_topics(corpus=wiki,
                           dictionary=lda.id2word,
                           coherence='u_mass')
logger.info("top topicsL %s", 'u_mass')
print(toptopics)
Example #30
def main():
    df = read_forum_json('json/levergunscommunity.com.json')
    corpus, dictionary = generate_corpus(df)
    lda = LdaMulticore(corpus, num_topics=20, id2word=dictionary, workers=3)
    lda.print_topics(num_topics=20, num_words=20)
Example #31
def LDA_topics(corpus, dictionary, num_topics):
    lda_model = LdaMulticore(corpus, id2word=dictionary, num_topics=num_topics, passes=10)
    return lda_model
    print('------>',p[0])

print('3 random reviews with the highest Negative sentiment polarity: \n')
neg = carReviews.loc[carReviews.Vader_Rating <= 2.5, ['EntireReview']].sample(3).values
for n in neg:
    print('------>',n[0])

#LDA Topic Modelling

#Approach 1
reviews = carReviews["ReviewTokens"]
dictionary = corpora.Dictionary(reviews)
#Term document frequency
doc_term_matrix = [dictionary.doc2bow(rev) for rev in reviews]
#perform LDA
ldamodel = LdaMulticore(corpus=doc_term_matrix, num_topics=8, id2word=dictionary, chunksize=2000, passes=20, per_word_topics=True)

#get highlighted topics
topics = ldamodel.show_topics()
lda_display = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False)

#show HTML view
pyLDAvis.save_html(lda_display,open("lda_8_topics.html","w"))

pprint(ldamodel.show_topics(formatted=False))

# Calculate coherence score
def compute_coherence_score(lda_model,reviews):
    coherence = CoherenceModel(lda_model,texts = reviews,dictionary = dictionary ,coherence = "c_v")
    return coherence.get_coherence(),coherence.get_coherence_per_topic()
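
A hypothetical call of the coherence helper above (it relies on the module-level `dictionary` built earlier in the snippet):

overall, per_topic = compute_coherence_score(ldamodel, reviews)
print("Coherence (c_v): %.4f" % overall)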
Example #33
class LdaPipeline(Pipeline):
    """Pipeline for creating and updating a gensim LDA model. This is a data
    sink. it does not return any new data
    """
    def __init__(self, num_topics, *args, **kwargs):
        """Initialises the LDA pipeline

        Args:
            num_topics (int): The number of topics in the LDA model
            workers (int): The number of workers to use to training. Defaults to
            num_cores - 1
        """
        super().__init__(*args, **kwargs)
        self._num_topics = num_topics
        self._workers = get_workers()

        # This is only used for lazy loading. Use self.get_model() to ensure it
        # is not None. And to create a new model if one does not exist.
        self._model = None

    @property
    def file_path(self):
        """str: the name of the model's file. It is of the form
        lda-num-topics.model
        """
        file_name = "lda-{num_topics}.model".format(
            num_topics=self._num_topics)
        return get_training_file_path(file_name)

    async def get_model(self):
        """This function is used to get an instance of an LdaModel. It will load
        the model from file if it finds one, otherwise it will create a new one
        using a saved dictionary if one exists. Lastly it will create a new
        dictionary from the corpus if it finds no pre-saved dictionary to use
        and no pre-saved LDA model to load.

        Returns:
            :obj:`gensim.models.ldamodel.LdaModel`: A gensim LdaModel
        """
        self._model = self._model or self._load_model()
        if not self._model:
            print("No previous model found. Creating a new one for training")
            dictionary = await DictionaryPipeline().get_dictionary()
            self._model = LdaMulticore(id2word=dictionary,
                                       workers=self._workers)
        return self._model

    def _load_model(self):
        """This function is used to load a gensim LdaModel from the models
        folder. Or `None` if one does not exist.

        Returns:
            :obj:`gensim.models.ldamodel.LdaModel`: The model found
            in ucla_topic_analysis/model/lda.model or None if there was
            no lda model saved or the number of topics does not match.
        """

        if os.path.isfile(self.file_path):
            return LdaMulticore.load(self.file_path)
        return None

    def save_model(self, file_path=None):
        """Saves the updated model to file
        """
        if self._model is not None:
            path = file_path or self.file_path
            self._model.save(path)
        else:
            raise Exception("Can not save. No model has been loaded.")

    @log_time
    def get_log_perplexity(self, mode):
        """Used to get the log perplexity for the LDA model.

        Args:
            mode (str): The label associated with the files for which to
                calculate the log perplexity.

        Returns:
            int: The log perplexity for the LDA model
        """
        # Get corpus
        corpus = LdaCorpusPipeline(mode=mode)

        # Make sure corpus data has been prepared
        if not os.path.isfile(corpus.get_file_path()):
            raise Exception("No corpus has been prepared.")

        model = self._model or self._load_model()
        if not model:
            raise Exception(
                "No model saved model found. Please tain one first.")
        return model.log_perplexity(corpus)

    @log_async_time
    async def train(self):
        """This function trains an LDA model from the data in the corpus file.
        It will overwrite any existing model and creating a new one if one does
        not exist.
        """
        # Get the dictionary
        dictionary = await DictionaryPipeline().get_dictionary()

        # Get corpus
        corpus = LdaCorpusPipeline()

        # Make sure corpus data has been prepared
        if not os.path.isfile(corpus.get_file_path()):
            await corpus.prepare_data()

        print("Training model. This might take some time")
        model = LdaMulticore(corpus=corpus,
                             num_topics=self._num_topics,
                             id2word=dictionary,
                             workers=self._workers)
        self._model = model
        self.save_model()

    async def coroutine(self, data):
        """Updates the model with the documents in the data. This is a data sink
        it does not return any new data

        Args:
            data (:obj:`list` of :obj:`list` of :obj:`(int, int)`): A list of
                documents, in bag of words representation
        """
        model = await self.get_model()
        model.update(data)
Example #34
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_records_filepath))

# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

lda_model_filepath = os.path.join('.', 'lda_model_all_diags')

if rerun:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)

    lda.save(lda_model_filepath)

# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

def explore_topic(topic_number, topn=10):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')
Example #35
]

#loop through input matrices of cancer types and perform LDA
for path in paths:
    project = "/".join(path.split("/")[:-1]) + "/ground.truth.syn.sigs.csv"
    print("Executing LDA algorithm on " + project)

    pd.read_csv()
    num_sigs = projectToSigs[project]
    bagOfMutations, idToChannel = matrixToBag(path)
    classification = list(pd.read_csv(path, sep="\t", usecols=[0]).iloc[:, 0])

    print("Extracting Bayes Signatures")
    ldamodel = LdaMulticore(bagOfMutations,
                            num_topics=num_sigs,
                            id2word=idToChannel,
                            passes=100,
                            iterations=100,
                            minimum_probability=0)

    print("Now Extracting Gibbs Signatures")
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=bagOfMutations,
                                                 num_topics=num_sigs,
                                                 id2word=idToChannel,
                                                 iterations=100,
                                                 topic_threshold=0.0)
    #hdpmodel = HdpModel(bagOfMutations, idToChannel, K=20, T=48)

    pickle.dump(ldamodel,
                open(output_path + project + '_lda_model.pickle', 'wb'))
    pickle.dump(ldamallet,
        #     #     if x in idmap:
        #     #         return x
        #     #     else:
        #     #         return -1
        #     for idx, (doc_id, document) in enumerate(corpus.documents.items()):
        #         if idx % 1000 == 0:
        #             logger.info("remapping: %d documents finished" % idx)
        #         # corpus.documents[doc_id] = [check_and_replace(oldid) for oldid in document]
        #         corpus.documents[doc_id] = [idmap[oldid] for oldid in document if oldid in idmap]

        corpus.save_tbmm_corpus(args.corpus_filename)

        if args.train_lda:
            # from gensim.models.ldamodel import LdaModel
            from gensim.models.ldamulticore import LdaMulticore

            # setting metadata to False is required because of the way logperplexity code requires the
            # output of get_texts to be.
            corpus.metadata = False
            lda = LdaMulticore(workers=19,
                               corpus=corpus,
                               id2word=corpus.dictionary,
                               num_topics=20,
                               eval_every=100,
                               chunksize=100,
                               passes=5)

            lda.print_topics(20)

            lda.save(args.corpus_filename + ".tbmm_lda.model")
Example #37
start_time = time.time()
score = cosine_similarity(X[0:10],X)
print("--- %s seconds ---" % (time.time() - start_time))


# calculate similarity and output title and error
x_array = X.toarray()
len(x_array[0])
len(X)


# Train LDA model: takes 327 seconds to train the model
start_time = time.time()
model = LdaMulticore(
                    matutils.Sparse2Corpus(X,documents_columns=False), 
                    num_topics=7,passes=10,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                    workers=7,
                    )
print("--- %s seconds ---" % (time.time() - start_time))



# Get all topics from training 
doc_list = []
for var in matutils.Sparse2Corpus(X,documents_columns=False):
    doc_list.append(var)

topic = model.print_topics(num_topics=7, num_words=10)

fin_sum = []
for i in range(len(doc_list)):
from gensim.corpora import Dictionary, HashDictionary, MmCorpus, WikiCorpus
from gensim.models import TfidfModel, LdaModel
from gensim.utils import smart_open, simple_preprocess
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore

wiki_corpus = MmCorpus('Wiki_Corpus.mm')   # Loading the corpus 
print (".... successfully loaded the corpus")

wiki_dict = Dictionary.load('WikiDictionary200k.dict') # Loading the dictionary
print (".... successfully loaded the dictionary")

lda = LdaMulticore(corpus=wiki_corpus, id2word=wiki_dict, num_topics=300, chunksize=10000, passes=2)

print ".... successfully extracted the topics; saving the model"
lda.save('WikiLDA_300.lda')

print "finished ...."
Example #39
    doc_tokens = []    
    doc_tokens = prepare_text_for_lda(doc)
    tokens.append(doc_tokens)

# Makes tokens column
df['tokens'] = tokens

id2word = Dictionary(df['tokens'])

id2word.filter_extremes(no_below=2, no_above=.99)

corpus = [id2word.doc2bow(d) for d in df['tokens']]


# Instantiating a Base LDA model 
base_model = LdaMulticore(corpus=corpus, num_topics=10, id2word=id2word, workers=12, passes=5)

words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

topics = [' '.join(t[0:10]) for t in words]

# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

p=pyLDAvis.gensim.prepare(base_model, corpus, id2word)
pyLDAvis.save_html(p, 'biden_lda.html')

base_model.save('biden_model.gensim')
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence
import pyLDAvis
import pyLDAvis.gensim


def bow(filepath, d):  # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


fake_sent = LineSentence('fake.txt')
fake_dict = Dictionary(fake_sent)
fake_dict.filter_extremes(no_below=5, no_above=0.2)
fake_dict.compactify()
fake_dict.save('fake.dict')
fake_dict = Dictionary.load('fake.dict')

MmCorpus.serialize('fake.mm', bow('fake.txt', fake_dict))
fake_corpus = MmCorpus('fake.mm')

fake_lda = LdaMulticore(fake_corpus,
                        num_topics=10,
                        id2word=fake_dict,
                        workers=2)
fake_lda.save('./fake_lda_model')
Example #41
# LDA Model Training

# We want to maximize the probability of the corpus in the training set.
corpus = scope_lda_sample.bow

print((
    'LDA Model based on {3} dataset.\n\tSample Size: {0},\n\tTop {1} Words,\n\tNo of Topics {2}'
    .format(sample_size, len(dictionary.values()), num_topics,
            data_scope_name)))

LDAmodel_scope = LdaMulticore(
    corpus=corpus,  #mm,
    id2word=dictionary,
    num_topics=num_topics,
    workers=4,
    chunksize=5000,
    passes=50,
    alpha='asymmetric',
    random_state=random_state)

dictionary.save(
    'data/model/{0}_dictionary.pkl'.format(research_scope))  #data_scope_name))
LDAmodel_scope.save(
    'data/model/{0}'.format(research_scope))  #data_scope_name))
# pickle the model here and insert in SQL
LDAmodel_scope = LdaMulticore.load(
    'data/model/{0}'.format(research_scope))  #data_scope_name))

# Feature vector
df_lda_features(LDAmodel_scope, scope_lda_sample)
Example #42
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

    # add tokens to list
    other_texts.append(stemmed_tokens)

other_corpus = [dictionary.doc2bow(text) for text in other_texts]

# unseen_doc = other_corpus[2]

# vector = ldamodel[unseen_doc]

# print(vector)

# generate LDA model-------------------------------------------------------------------------

my_loop_num_topics = [2, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]

for i in my_loop_num_topics:
    my_num_topics = i
    print(my_num_topics)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
    myldamodel = LdaMulticore(corpus,
                              num_topics=my_num_topics,
                              id2word=dictionary,
                              workers=3,
                              alpha=1e-5,
                              eta=5e-1)
    print(myldamodel.print_topics(num_topics=my_num_topics, num_words=5))
    print(myldamodel.log_perplexity(corpus))
    print(myldamodel.log_perplexity(other_corpus))
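
Note that `log_perplexity` returns a per-word variational bound rather than perplexity itself; converting it as example #11 does makes the held-out numbers easier to compare across topic counts. A short sketch reusing the last model from the loop above:

import numpy as np

held_out_bound = myldamodel.log_perplexity(other_corpus)
print("held-out perplexity: %.1f" % np.exp2(-held_out_bound))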
Example #43
    # add tokens to list
    texts.append(stemmed_tokens)

# turn our tokenized documents into a id <-> term dictionary
dictionary = corpora.Dictionary(texts)

# convert tokenized documents into a document-term matrix
corpus = [dictionary.doc2bow(text) for text in texts]

# generate LDA model
my_num_topics = 30
# ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=my_num_topics, id2word = dictionary, passes=20)
ldamodel = LdaMulticore(corpus,
                        num_topics=my_num_topics,
                        id2word=dictionary,
                        workers=3,
                        alpha=1e-5,
                        eta=5e-1)

print(ldamodel.print_topics(num_topics=my_num_topics, num_words=5))
print(corpus[0])
print(corpus[1])
print(corpus[2])
print(ldamodel[corpus[0]])
print(ldamodel[corpus[1]])
print(ldamodel[corpus[2]])

print(ldamodel.print_topics(20))

#----------------------------------------------------------------------
new_texts_set = [
Example #44
article_dict = Dictionary.load('articles.dict')


def bow(filepath, d):  # output bag of words representation
    for review in LineSentence(filepath):
        yield d.doc2bow(review)


# generate bag-of-words representations for all reviews and save them as a matrix
MmCorpus.serialize('articles.mm', bow('articles.txt', article_dict))

# load the finished bag-of-words corpus from disk
corpus = MmCorpus('articles.mm')

# Create LDA model
lda = LdaMulticore(corpus, num_topics=10, id2word=article_dict, workers=2)
lda.save('./lda_model')
lda = LdaMulticore.load('./lda_model')

# It's really slow when they're all together for some reason

# For real and fake dataframes
# fake_sent = LineSentence('fake.txt')
# fake_dict = Dictionary(fake_sent)
# fake_dict.filter_extremes(no_below=5, no_above=0.2)
# fake_dict.compactify()
# fake_dict.save('fake.dict')
# fake_dict = Dictionary.load('fake.dict')
#
# real_sent = LineSentence('real.txt')
# real_dict = Dictionary(real_sent)
Example #45
    print(prod_cat_name_prefix)

    # load and build user products
    # products correspond to corpus
    df_user_prods = common.load_df("../data/",
                                   user_corpus_name,
                                   converters={"user_corpus": literal_eval})
    df_user_prods = df_user_prods.set_index('user_id', drop=False)
    user_prods = list(df_user_prods.user_corpus)

    # build lda model
    if __debug__:
        print "In debug mode, None-debug mode command : python -O " + __file__ + "\n\n"
        print "Processing 200 users, 10 topics"
        up_lda = LdaMulticore(corpus=user_prods[0:1000],
                              id2word=id2prod,
                              workers=3,
                              num_topics=20)
        up_lda.save('/tmp/up.lda')
        loaded_up_lda = LdaModel.load('/tmp/up.lda')
        loaded_up_lda.show_topics()
        loaded_up_lda.get_document_topics(user_prods[0])
        loaded_up_lda.get_term_topics(13176)

        # get user cats
        df_user_cat = get_user_cat(loaded_up_lda, df_user_prods.iloc[100:120])

        # get prod cats
        i = 3
        cat_prods = [x[0] for x in loaded_up_lda.get_topic_terms(i, topn=20)]
        df_prod_cat = get_prod_cat(loaded_up_lda, products.loc[cat_prods])
        print(df_prod_cat)
Example #46
File: __main__.py Project: healx/agatha
    )
    sentence_ids, text_corpus = bow_util.filter_words(
        keys=sentence_ids,
        text_corpus=text_corpus,
        stopwords=stopwords_under,
    )
    print(f"\t- Reduced to {len(text_corpus)} documents")
    assert len(sentence_ids) == len(text_corpus)

    print("Computing topics")
    word_index = Dictionary(text_corpus)
    int_corpus = [word_index.doc2bow(t) for t in text_corpus]
    topic_model = LdaMulticore(
        corpus=int_corpus,
        id2word=word_index,
        num_topics=config.topic_model.num_topics,
        random_state=config.topic_model.random_seed,
        iterations=config.topic_model.iterations,
    )

    #####################################################
    # Store results
    print("Interpreting")
    result = qpb.TopicQueryResult()
    result.source = config.source
    result.target = config.target

    # Add path
    for p in path:
        result.path.append(p)
Example #47
def main(argv):
    """
    The main function of the script.

    Flags:
        --tokenize          : Apply tokenizing and stemming.
        --load-tokenized    : Load existing tokenized and stemmed data.
        --load-lda          : Load existing trained LDA (with --num-topics X topics).
        --load-features     : Load saved features (of type --topic/term/char-features or raw).
        --filter-extremes   : Remove extreme tokens from corpus.
        --filter-types      : Remove only MBTI tokens.
        --topic-features    : Use topic distribution for document as feature vector.
        --term-features     : Use term topics as feature vector, together with TF.
        --char-features     : Use char frequency as feature vector.
        --num-topics X      : X: int, follow after --num-topics arg with space between.
        --normalize         :

    TODO: Write more here.
    """
    setup_logging()

    mbti = {
        'I': 'Introversion',
        'E': 'Extroversion',
        'N': 'Intuition',
        'S': 'Sensing',
        'T': 'Thinking',
        'F': 'Feeling',
        'J': 'Judging',
        'P': 'Perceiving'
    }

    if not "--load-tokenized" in argv:
        data_set = get_data(argv)
        types = sorted(set(data_set["type"]))
        #create_wordclouds(train)

        data_set["posts"].apply(handle_delimiter)

        data_set["posts"] = [post.lower() for post in data_set["posts"]]

        if "--tokenize" in argv or "-t" in argv:

            if "--filter-extremes" in argv:
                filter_level = 'extremes'
            elif "--filter-types" in argv:
                filter_level = 'types'
            else:
                filter_level = None

            data_set["posts"] = tokenize_and_stem(data_set["posts"],
                                                  types,
                                                  create_corpus=True,
                                                  filter_level=filter_level)

            logging.info("Saving tokenized and stemmed data.")
            data_set.to_csv(TOKENIZED_DATA_FILE, index=False)
    else:
        logging.info("Loading tokenized data.")
        data_set = load_csv(TOKENIZED_DATA_FILE)
        data_set["posts"] = [str(post) for post in data_set["posts"]]
        types = sorted(set(data_set["type"]))

    scoring = {
        'acc': 'accuracy',
        'prec_micro': 'precision_micro',
        'rec_micro': 'recall_micro',
        'f1_micro': 'f1_micro'
    }

    prob_scoring = {'neg_log_loss': 'neg_log_loss'}

    X_train, X_test, y_train, y_test = train_test_split(
        data_set["posts"],
        data_set["type"],
        test_size=0.3,
        stratify=data_set["type"],
        random_state=1773)

    print(Counter(y_train), len(y_train))

    corpus = corpora.MmCorpus(CORPUS_FILE)
    dictionary = corpora.Dictionary.load(DICTIONARY_FILE)

    if "--load-lda" in argv:
        logging.info("Loading LDA for %s topics...", NUM_TOPICS)
        lda = LdaMulticore.load(LDA_FOLDER + "lda_model_{}".format(NUM_TOPICS))
    else:
        logging.info("Generating LDA for %s topics...", NUM_TOPICS)
        lda = LdaMulticore(corpus,
                           num_topics=NUM_TOPICS,
                           id2word=dictionary,
                           workers=3,
                           passes=50,
                           batch=True,
                           iterations=500)
        lda.save(LDA_FOLDER + "lda_model_{}".format(NUM_TOPICS))

    #print(lda.print_topics(num_topics=NUM_TOPICS, num_words=30))

    if "--term-features" in argv:
        feature_type = "terms"
    elif "--topic-features" in argv:
        feature_type = "topics"
    elif "--char-features" in argv:
        feature_type = "chars"
    else:
        logging.warning("Default features used (TF-IDF, etc.)")
        feature_type = "raw"

    if not "--load-features" in argv:
        logging.info("Extracting X_train features...")
        X_train_features, save_name = get_features(X_train, feature_type, lda,
                                                   dictionary)
        np.save(X_TRAIN_FOLDER + "X_train_features_{}.npy".format(save_name),
                X_train_features)

        logging.info("Extracting X_test features...")
        X_test_features, save_name = get_features(X_test, feature_type, lda,
                                                  dictionary)
        np.save(X_TEST_FOLDER + "X_test_features_{}.npy".format(save_name),
                X_test_features)
    else:
        logging.info("Loading feature vectors...")
        if feature_type == "chars" or feature_type == "raw":
            file_ending = feature_type
        elif feature_type == "topics" or feature_type == "terms":
            file_ending = "{}_{}".format(feature_type, NUM_TOPICS)
        else:
            raise Exception("Feature type {} is unknown!".format(feature_type))
        X_train_features = np.load(
            X_TRAIN_FOLDER + "X_train_features_{}.npy".format(file_ending))
        X_test_features = np.load(X_TEST_FOLDER +
                                  "X_test_features_{}.npy".format(file_ending))

    if "--normalize" in argv:
        X_train_features = normalize(X_train_features)
        X_test_features = normalize(X_test_features)

    #y_train = [mbti_type[2:4] for mbti_type in y_train]
    #y_test = [mbti_type[2:4] for mbti_type in y_test]

    if feature_type == "raw":
        #"extra_trees" "sgd"
        clf = get_model("etc", argv, X_train_features, y_train, scoring,
                        prob_scoring)
    elif feature_type == "terms":
        clf = get_terms_classifier("logit", scoring, prob_scoring)
    elif feature_type == "topics":
        clf = get_topics_classifier("gradboost", scoring, prob_scoring)
    elif feature_type == "chars":
        clf = get_chars_classifier("linear", scoring, prob_scoring)
    else:
        raise Exception("Feature type {} is unknown".format(feature_type))

    #clf.fit(X_train_features, y_train)
    #logging.info("Best: %s, %s, %s", clf.best_index_, clf.best_score_, clf.best_params_)
    #clf = get_model("lr", argv, X_train, y_train, scoring) # extra_trees sgd lr
    logging.info("Testing clf: %s", clf)
    logging.info("Test set score: %s", clf.score(X_test_features, y_test))
    y_pred = clf.predict(X_test_features)
    logging.info(
        precision_recall_fscore_support(y_test, y_pred, average="micro"))
    print(Counter(y_pred))
    logging.info(classification_report(y_test, y_pred))

    my_own_data = [
        "I'm eagerly waiting for the next development on Social Media Platforms: being able to like likes on your/others' Social Media posts.",
        "The autumn semester was beyond all of my expectations. I've learned a lot of new things and gained new friends from all over the world. A big thank you to all of you for making my time here amazing. I'll never forget you. I hope the spring semester at ETH will be as memorable as the autumn one. <URL>",
        "Month of finals: 6 out of 7 exams done so  far with varying performance and 6 seasons of series completed. It's all about balance in life!",
        "Why do I always eat food while cooking? I'm always full by the time the dish is finished!",
        "Reliving my childhood #harrypotter #game <URL>",
        "Peridot loves Steven #stevenuniverse #pfefferkuchen #pepparkakor #gingerbreadcookies #selfmade #happyholidays <URL>",
        "Guess who's back on twitter? - this neeerd",
        "Java Lecture: When you feel like taking a nap.",
        "A bunch of friends on a friday night playin' #PropHunt! Awesome!",
        "Interesting day :D", "#dhopen @QuanticHyuN GG! you're the best!",
        "I stand by TotalBiscuit and the Terran Republic in the PlanetSide 2 Ultimate Showdown!  #PS2showdown",
        "longing for P tutorials with @ApolloSC2. in the mean time I'll ladder against High Gold/Diamond players! Thanks to you I'm now in gold!! :D",
        "Loosing hard in #SC2, MMMVG and Broodlords are really hard to deal with :(",
        "I uploaded a @YouTube video <URL>  [CoD: WaW] Nostalgia! Quick Match! 30 - 2",
        "I uploaded a @YouTube video <URL>  BF3 Test Footage",
        "Thanks! Now I know! <URL>  #ComputerPowerTest",
        "One does not simply make games without passion.",
        "I uploaded a @YouTube video <URL> Swetrox- - MW3 Game Clip",
        "I uploaded a @YouTube video <URL>  Shatterhand Audio School Project",
        "Finally some spare time! Time for #BF3!!! :D",
        "I uploaded a @YouTube video <URL>  Swetrox- - MW3 Game Clip",
        "I nominate @totalbiscuit for a Shorty Award in #gaming because he delivers entertaining top-quality gaming videos. <URL>",
        "#MW3 released a new game mode for FREE, and MIGHT release some free DLCs! What's that, #BF3? Right, you guys already do that!! :D",
        "Our first duet coming up soon! #LAN and #Singstar <3 The song: The Killers, When we were young!",
        "Land of Confusion! #Singstar",
        "I forgot: I also bought some APELSIN KROKANT! :D",
        "Bought some pizza, 1 Grape Tonic, 1 Grappo and 2 Ciders! #LAN",
        "We're up and running! #LAN time!! Gonna warm up with some #BF3 Wanna join? :D",
        "Would be awesome if I had any spare time to work on my #XNA game! Maybe do some bugfixing or animating the player? :D",
        "Enthusiastic about tomorrow's 18 hour LAN-Party! Gonna play soo many games! #StarCraft2  #BF3 #Sanctum being a few of 'em!",
        "#SC2 Time!",
        "Time to sleep. Tomorrow is a new day, filled with #Skyrim #StarCraft2 #XNA and #Floorball Good night! :D",
        "XNA Time!", "Just picked up mw3 :D"
    ]
    my_own_data = [" ".join(my_own_data)]
    my_own_data = [post.lower() for post in my_own_data]

    if feature_type != "raw":
        my_own_data = tokenize_and_stem(my_own_data, types)
        my_own_data = [" ".join(my_own_data)]

    logging.info("Extracting features from my own data...")
    my_own_data_features = get_features(my_own_data, feature_type, lda,
                                        dictionary)[0]
    if "--normalize" in argv:
        my_own_data_features = normalize(my_own_data_features)
    print(my_own_data_features)
    print(clf.predict(my_own_data_features))
    try:
        predicted_classes = clf.predict_proba(my_own_data_features)
        pprint(list(zip(types, predicted_classes[0])))
    except Exception:
        print("Predict probabilities not supported...")
    return 0
Example #48
0
File: lda.py Project: frnsys/geiger
 def train(self, vecs):
     """
     Build the topic model.
     """
     corp = Scipy2Corpus(vecs)
     self.m = LdaMulticore(corp, num_topics=self.n_topics, iterations=1000, workers=3)
Example #49
0
# N = len(alldata)
# ii=800000
# ff=ii+20000
# while ff<N:
#     aa = reviewPreProcess(alldata['FullReview'][ii:ff])
#     ii=ff
#     ff=ii+20000
#     print(ff)
# else:
#     aa = reviewPreProcess(alldata['FullReview'][ii:N])
d = reviewPreProcess(alldata['FullReview'])
# bigram_model, trigram_model, trigram_dictionary = reviewPreProcess(alldata['FullReview'])
trigram_bow_corpus, lda = LDA_Model(15)
import pickle
trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')
lda = LdaMulticore.load('./models2/lda_model')

LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus, trigram_dictionary)

# Save pre-prepared pyLDAvis data to disk:
with open('./models2/ldavis_prepared', 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

# load the pre-prepared pyLDAvis data from disk:
with open('./models2/ldavis_prepared', 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# pyLDAvis.display(LDAvis_prepared)
pyLDAvis.save_html(LDAvis_prepared,'./models2/lda.html')
Example #50
0
class recommendationsys_LDA:
    def __init__(self, ngram):
        # load the spacy english model
        self.nlp = spacy.load('en')
        
        self.extrawords = ["'s", "st", "th", "’s", "-PRON-", "’", "htt", "ht", "km", "pm", "am"]
        
        # parse the latest emoji code
        html = str(ur.urlopen('http://www.unicode.org/Public/emoji/5.0/emoji-data.txt').read())
        codes=list(map(lambda x: '-'.join(['\\U'+a.zfill(8) for a in x.split('..')]).encode().decode('unicode-escape'),re.findall(r'(?<=\\n)[\w.]+',html)))
        self.emojiPattern = re.compile('['+','.join(codes)+']',flags=re.UNICODE)
        
        PROJECT_DIRECTORY = 'output/project/' + project_name

        self.f_titles = PROJECT_DIRECTORY + '/titlesLF_target.txt'

        self.f_authors = PROJECT_DIRECTORY + '/authors_target.txt'
        
        self.authorcontent_clean = {}
        
        self.ngram_bow_corpus = []
        
        self.ldavec = {}
        
        self.ngram_dictionary = None
        
        self.ngram = ngram
        self.num_topics = None

    def clean_text(self, text):
    
        # lowercase the text and replace '\n' with ' . ' ('rt' markers are stripped by the regex below)
        text = text.lower()
        #text = text.replace('RT',' ')
        text = text.replace('\n',' . ')    
    
        # strip mentions, hashtags, urls, stray punctuation and 'rt' markers,
        # then remove emojis (the emoji pattern targets UCS-2 builds)
        myre = re.compile(u'('
                         '@\S*\s?|#|'   # remove @ mention names and hastag sign
                         'http[s]?[:…]+\S*\s|' # remove url
                         '[-~\^\$\*\+\{\}\[\]\\\|\(\)/“"]|'
                         'rt[:]? |'
                         '…'
                         ')+', re.UNICODE)

        text = myre.sub(' ', text)
        text = self.emojiPattern.sub(' ', text)

        text = text.replace('&amp;','and')
        
        
        
                
    

        #text = ' '.join(text)
        


        
        return text


#---------------------------
# make the recommendations
#---------------------------
    def recomendation(self, username, topicn=0, candidates=None):

        similarities = self.ldacosinesimilarity(username, topicn)
        result = []
        # if no candidate list is given, rank the whole dataset
        if not candidates:
            for key, value in sorted(similarities.items(), key=lambda x: x[1]):
                result.append((key, value))
        else:
            for i in candidates:
                result.append((i, similarities[i]))

            # sort the result by similarity
            result = sorted(result, key=lambda x: x[1])

        return result

#---------------------------
# load and clean the data
#---------------------------
    def loadandclean(self, n=-1):

        #authorcontent = {}

        # ------
        with codecs.open(self.f_titles, encoding='utf_8') as f_t:
            with codecs.open(self.f_authors, encoding='utf_8') as f_a:
                for l_a, l_t in zip(f_a, f_t):
                    # remove the '\n' at the end
                    key = l_a[:-1].lower()
            
                    l_t = self.clean_text(l_t)
                    if key in self.authorcontent_clean:
                        
                        self.authorcontent_clean[key].append(l_t)
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    else:
                        
                        self.authorcontent_clean[key] = [l_t]
                        #self.authorcontent_clean[key] = self.clean_text(value)
                    
                    if n != -1 and len(self.authorcontent_clean) == n:
                        break
        # ---------------                
        

        for key, value in self.authorcontent_clean.items():
            self.authorcontent_clean[key] = self.lemmatized_sentence_corpus(self.authorcontent_clean[key])


    
#------------------------------------------------------
# build the trigram content based on the clean content
#------------------------------------------------------
    
    def punct_space(self, token):
        """
        helper function to eliminate tokens
        that are pure punctuation or whitespace
        """
        #return token.pos_ == 'NOUN' or token.is_punct or token.is_space or token.lemma_ in spacy.lang.en.STOP_WORDS or token.lemma_ in self.extrawords or len(str(token)) < 2
        return token.is_punct or token.is_space or token.lemma_ in spacy.lang.en.STOP_WORDS or token.lemma_ in self.extrawords or len(str(token)) < 2

    def lemmatized_sentence_corpus(self, contents):
        """
        generator function to use spaCy to parse reviews,
        lemmatize the text, and yield sentences
        """
        sentents = []
    
        for content in self.nlp.pipe(contents,batch_size=500, n_threads=8):
        
            for sent in content.sents:
                #sentents.append(u' '.join([token.lemma_ for token in sent
                #                 if not punct_space(token)]))
                #sentents.append([token.lemma_ for token in sent
                #                 if not punct_space(token)])
                tokens = []
                for token in sent:
                    if self.punct_space(token):
                        continue
                
                    #if token.lemma_ == '-PRON-':
                    #    token.lemma_ = token.lower_
                    tokens.append(token.lemma_)
                
                sentents.append(tokens)
                    
        return sentents

    """
    prepare the parameters for lda
    """
    def ldainit(self):
        
#        self.num_topics = num_topics
#        ngram = self.ngram
#        # if ngram_bow_corpus is empty, build it first
#        if not self.ngram_bow_corpus: 
        
        self.user_sentences = self.authorcontent_clean
        self.user_bigramsentences = {}
        self.all_sentences = []
        self.all_bigram_sentences = []
        
        sentences = list(self.authorcontent_clean.values())
        self.all_sentences = [item for sublist in sentences for item in sublist]
        
        # build bigram model
        if self.ngram == 2:
            self.bigram_model = Phrases(self.all_sentences)
            for user,content in self.user_sentences.items():
                bigram_s = []
                for s in content:
                    bigram_s.append(self.bigram_model[s])
                self.user_bigramsentences[user] = bigram_s
                self.all_bigram_sentences += self.user_bigramsentences[user]
                
            
            
    def trainlda(self, topics_n = 10):
        self.num_topics = topics_n
        
        alltexts = []
        for name,sentences in self.user_sentences.items():
            sentences = [item for sublist in sentences for item in sublist]
            alltexts.append(sentences)
        
        
        if self.ngram_dictionary is None:
            # both the unigram and bigram paths build the dictionary from the
            # flattened per-user texts
            self.ngram_dictionary = Dictionary(alltexts)

            # filter tokens that are very rare or too common from
            # the dictionary (filter_extremes) and reassign integer ids (compactify)
            self.ngram_dictionary.filter_extremes(no_below=10, no_above=0.8)
            self.ngram_dictionary.compactify()


        # each user's flattened token list becomes one bag-of-words document
        ngram_bow_corpus = []
        for sentence in alltexts:
            ngram_bow_corpus.append(self.ngram_dictionary.doc2bow(sentence))

        self.lda = LdaMulticore(ngram_bow_corpus,
                                num_topics=topics_n,
                                id2word=self.ngram_dictionary,
                                workers=3)

        # calculate topic coherence for the trained model
        topics=[]

        for i in range(self.lda.num_topics):
            terms = []
            for n in self.lda.show_topic(i):
                terms.append(n[0])
            topics.append(terms)
        
        cm_umass = CoherenceModel(topics=topics, corpus=ngram_bow_corpus, dictionary=self.ngram_dictionary, coherence='u_mass')
        cm_cv = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_v')
        cm_cuci = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_uci')
        cm_cnpmi = CoherenceModel(topics=topics, texts=alltexts, dictionary=self.ngram_dictionary, coherence='c_npmi')

        return topics_n, cm_umass.get_coherence(), cm_cv.get_coherence(),cm_cuci.get_coherence(),cm_cnpmi.get_coherence()

        
    def explore_topic(self, topic_number, topn=25):
        """
        accept a user-supplied topic number and
        print out a formatted list of the top terms
        """
        
        print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

        for term, frequency in self.lda.show_topic(topic_number, topn):
            print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))



    def runlda(self, username):
        
        if self.ngram == 1:
            user_sentences = self.user_sentences[username]
        elif self.ngram == 2:
            user_sentences = self.user_bigramsentences[username]
        
        # flat the list of list into single list
        user_sentences = [item for sublist in user_sentences for item in sublist]
        user_bow = self.ngram_dictionary.doc2bow(user_sentences)

        user_lda = self.lda[user_bow]

        #user_lda = sorted(user_lda, key=lambda x:-x[1])
        
        return user_lda

    """
    compute the lda topic vec for every one 
    """
    def runldavec(self):
        if not self.ldavec:
            for key, value in self.user_sentences.items():
                vec = np.zeros(self.num_topics)
                result = self.runlda(key)
                for i in result:
                    vec[i[0]] = i[1]
                self.ldavec[key] = vec
                
            
    """
    """
    def runtopntopic(self, n):
        self.topntopics = []
        
        for key, value in self.ldavec.items():
            idx = value.argsort()
                
            self.topntopics += list(idx[-n:])
        
        self.topntopics = list(set(self.topntopics))
    
    """
    compute the lda cosine similarity between a given user and the rest users
    """
    def ldacosinesimilarity(self, username, topn=0):
        if username not in self.authorcontent_clean:
            print('The user cannot be found')
            return
        if topn < 0:
            print('topn should be >= 0')
            return
        
        topn = int(topn)
        
        cosinesimilaritydic = {}
        
        if not self.ldavec:
            self.runldavec()
        
        if topn == 0:
            usertopicvec = self.ldavec[username]
        else:
            self.runtopntopic(topn)
            usertopicvec = self.ldavec[username][self.topntopics]
            
        for key, value in self.ldavec.items():
            if key != username:
                if topn == 0:
                    pairtopicvec = value
                else:
                    pairtopicvec = value[self.topntopics]
                # pairwise_distances returns the cosine *distance*
                # (1 - cosine similarity), so smaller values mean more similar users
                cosinedistance = pairwise_distances(np.array(usertopicvec).reshape(1, -1),
                                                    np.array(pairtopicvec).reshape(1, -1),
                                                    metric='cosine')[0][0]
                cosinesimilaritydic[key] = cosinedistance
                
        return cosinesimilaritydic
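A minimal end-to-end sketch of how this class appears intended to be used, assuming `project_name` and the title/author files already exist; the author name and the numbers are illustrative:

rec = recommendationsys_LDA(ngram=1)
rec.loadandclean(n=200)                 # parse, clean and lemmatize up to 200 authors
rec.ldainit()                           # flatten per-user sentences (builds bigrams when ngram == 2)
n, umass, cv, cuci, cnpmi = rec.trainlda(topics_n=10)
print(n, umass, cv, cuci, cnpmi)        # coherence scores for this topic count
ranked = rec.recomendation('some_author', topicn=5)  # most similar authors first (smallest cosine distance)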
Example #51
0
import requests

import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary

# Python cannot import from a quoted path; put the utilities directory on
# sys.path and import the module by name instead.
import sys
sys.path.append('../../')
from nlp_utils import lemmatize_stemming, preprocess, text_from_html, tag_visible

#link = "https://www.nytimes.com/2018/06/12/opinion/earth-will-survive-we-may-not.html"
#link = "https://www.slowtwitch.com/Products/Tri_Bike_by_brand/Specialized/S-Works_Shiv_Disc_7053.html"

#html = requests.get(link).text
#article = text_from_html(html)
article = "How a Pentagon deal became an identity crisis for Google"

lda_model_tfidf = LdaMulticore.load("kaggle_lda_tfidf")
dictionary = Dictionary.load('kaggle_dict')

bow_vector = dictionary.doc2bow(preprocess(article))
for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Example #52
0
import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

# In[26]:

lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'

# In[27]:

lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

all_numbers = list(range(0, 50))
# build the frame in one call; DataFrame.append inside a loop is deprecated
df_all_numbers = pd.DataFrame({"topic_number": all_numbers})

# In[32]:

#output_file = open('review_bus_lda.txt','w')

# In[ ]:
Example #53
0
eval_every = None
workers = 6
random_state_list = [7,14,21,28]
# random_state  = 42

for data in dataset:
    for n_topics in num_topics_list:
        for random_state in random_state_list:
            starttime = datetime.datetime.now()
            print('dataset:', data, 'num_topics:', n_topics)
            data_dir = './%s_data'%data
            dictionary = Dictionary.load(os.path.join(data_dir, 'ne_weighting.dict'))
            bow_news = load_model(os.path.join(data_dir, 'ne_weighting.bow'))
            dict_id2token = dict(dictionary.items())

            lda = LdaMulticore(bow_news, id2word=dict_id2token, num_topics=n_topics, passes=passes, iterations=iterations,\
                               eval_every=eval_every, workers=workers, random_state=random_state)

            #print(lda.show_topics(num_topics=num_topics, num_words=20))

            name = 'ne_topic%s_passes%s_iteration%s_random%s' % (n_topics, passes, iterations, random_state)
            result_dir = os.path.join(data_dir, name)
            if not os.path.exists(result_dir):
                os.mkdir(result_dir)

            lda.save(os.path.join(result_dir, 'lda_model'))

            topics = lda.show_topics(num_topics=n_topics, num_words=20, log=False, formatted=False)
            # write the topics to file
            with open(os.path.join(result_dir, 'topics.txt'), 'w', encoding='utf-8') as f:
                for topic in topics:
                    f.write('topic ' + str(topic[0]) + ':\n')
Example #54
0
 def load_model(self, model_path, wrd2idx_path):
     self.model = LdaMulticore.load(model_path)
     with open(wrd2idx_path) as fid:
         wrd2idx = cPickle.load(fid)
     self.load_vocabulary(wrd2idx)
 
res = fin_res
len(res)
res[0]
#==============================================================================
# Train LDA model (takes about 1655 seconds)
#==============================================================================
# No need to run LDA every time; the model has been stored
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(res)
vocab = vectorizer.get_feature_names()
start_time = time.time()
model = LdaMulticore(
                    matutils.Sparse2Corpus(X,documents_columns=False), 
                    num_topics=9,passes=10,
                    chunksize=5000,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]),
                    workers=7,
                    )
print("--- %s seconds ---" % (time.time() - start_time))
fname = '/Users/royyang/Desktop/trending_project/re_categorization_ls/LDA_9topics'
model.save(fname)

#Load a pretrained model
model = LdaModel.load(fname, mmap='r')
type(model)

#==============================================================================
# Get all topics from training
# topic_number, number_of_articles, top_words
#==============================================================================
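The banner above describes the intended output, but the snippet ends here. Below is a sketch of one way to produce it from the objects defined earlier (`model`, `X`, `matutils`); using the dominant topic per article and collections.Counter is an assumption, not from the original script:

from collections import Counter

corpus = matutils.Sparse2Corpus(X, documents_columns=False)

# count how many articles have each topic as their dominant topic
article_counts = Counter()
for bow in corpus:
    doc_topics = model.get_document_topics(bow, minimum_probability=0.0)
    dominant_topic = max(doc_topics, key=lambda t: t[1])[0]
    article_counts[dominant_topic] += 1

# topic_number, number_of_articles, top_words
for topic_id, terms in model.show_topics(num_topics=model.num_topics, num_words=10, formatted=False):
    top_words = ', '.join(word for word, _ in terms)
    print("%d\t%d\t%s" % (topic_id, article_counts[topic_id], top_words))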