def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    lda.partial_fit(X)
    assert_raises_regexp(ValueError, r"^The provided data has",
                         lda.partial_fit, X_2)
def main():
    X = np.array([[0, 1, 0, 2, 2, 0],
                  [1, 0, 1, 1, 3, 3]])

    olda = OnlineLDA(n_topics=2)
    olda.partial_fit(X)
    print(olda.lambda_)

    lda = LatentDirichletAllocation(n_topics=2, total_samples=2)
    lda.partial_fit(X)
    print(lda.components_)
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = np.random.randint(4, size=(10, n_col))
    X_2 = np.random.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has",
                         lda.partial_fit, X_2)
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_components = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_components=n_components,
                                    random_state=rng)
    lda.partial_fit(X)
    with pytest.raises(ValueError, match=r"^The provided data has"):
        lda.partial_fit(X_2)
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=-1,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in range(2):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert tuple(sorted(top_idx)) in correct_idx_grps
def test_lda_partial_fit():
    # Test LDA online learning (`partial_fit` method)
    # (same as test_lda_batch)
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=10.,
                                    total_samples=100, random_state=rng)
    for i in xrange(3):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def topic_modeling(gen, n_components):
    """
    Takes in a cursor generator and number of components for LDA
    and returns topics
    """
    # NOTE: `gen` is consumed once by fit() and again in the loop below,
    # so it must be re-iterable (e.g. a list or a fresh cursor), not a
    # one-shot generator.
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                       stop_words='english',
                                       token_pattern="\\b[a-z][a-z]+\\b",
                                       lowercase=True,
                                       max_df=0.6)
    count_vectorizer.fit(gen)
    lda = LatentDirichletAllocation(n_components)
    for _ in range(10):
        for file in gen:
            vec_file = count_vectorizer.transform([file])
            lda.partial_fit(vec_file)
    return display_topics(lda, count_vectorizer.get_feature_names(), 10)
def iter_epochs(n_word_types, docs, n_topics, seed):
    D = len(docs)
    V = n_word_types
    docs = [list(Counter(doc).items()) for doc in docs]
    X = lil_matrix((D, V), dtype=np.int)
    for d, doc in enumerate(docs):
        for v, c in doc:
            X[d, v] = c
    X = X.tocsr()

    model = LatentDirichletAllocation(n_topics=n_topics,
                                      learning_method='online',
                                      random_state=seed)
    while True:
        start_time_s = time.time()
        model.partial_fit(X)
        processing_time_s = time.time() - start_time_s

        phikv = model.components_
        yield dict(topic_word_distribution=phikv,
                   processing_time_s=processing_time_s)
def main():
    X = np.array([[1, 1, 1, 0, 0, 0, 0, 0, 0],
                  [0, 0, 0, 1, 1, 1, 1, 2, 1]])

    olda = OnlineLDA(n_topics=2, tau0=80)
    olda.partial_fit(X)
    print(olda.lambda_)

    lda = LatentDirichletAllocation(n_topics=2, total_samples=2,
                                    learning_offset=80, learning_decay=0.8,
                                    mean_change_tol=0.00001, max_iter=10000)
    lda.fit(X)
    print(lda.perplexity(X))

    lda = LatentDirichletAllocation(n_topics=2, total_samples=2,
                                    learning_offset=80, learning_decay=0.8,
                                    mean_change_tol=0.00001, max_iter=10000)
    lda.partial_fit(X)
    print(lda.perplexity(X))
class ScikitLda(object):
    def __init__(self, corpus=None, lda=None, n_topics=10, max_iter=5,
                 learning_method='online', learning_offset=50., **kwargs):
        if lda is None:
            self.lda = LatentDirichletAllocation(
                n_topics=n_topics, max_iter=max_iter,
                learning_method=learning_method,
                learning_offset=learning_offset, **kwargs)
        else:
            self.lda = lda
        self._corpus = corpus
        self._weights = None

    def fit(self):
        self.lda.fit(self.corpus.sparse_matrix())

    def partial_fit(self, corpus):
        self.lda.partial_fit(corpus.sparse_matrix())
        self._weights = None

    @property
    def topics(self):
        return self.lda.components_

    @property
    def n_topics(self):
        return self.lda.n_topics

    @property
    def corpus(self):
        return self._corpus

    @property
    def weights(self):
        if self._weights is None:
            self._weights = self.partial_weights(self.corpus)
        return self._weights

    def partial_weights(self, corpus):
        weights = self.transform(corpus)
        return (weights.T / weights.sum(axis=1)).T

    def transform(self, corpus):
        return self.lda.transform(corpus.sparse_matrix())

    def topic_words(self, n_words=10):
        topicWords = []
        topicWeightedWords = []
        for topic_idx, topic in enumerate(self.topics):
            weightedWordIdx = topic.argsort()[::-1]
            wordsInTopic = [self.corpus.word(i)
                            for i in weightedWordIdx[:n_words]]
            weights = topic / topic.sum()
            topicWeights = [(weights[i], self.corpus.word(i))
                            for i in weightedWordIdx[:n_words]]
            topicWords.append(wordsInTopic)
            topicWeightedWords.append(topicWeights)
        return (topicWords, topicWeightedWords)

    def save(self, filename):
        joblib.dump(self.lda, filename)

    @classmethod
    def load(cls, filename, corpus=None):
        lda = joblib.load(filename)
        return cls(lda=lda, corpus=corpus)
def main():
    from neomodel import config, db
    from model.graph import connection_url
    from model.graph.spotify.track import Track
    from model.graph.spotify.playlist import Playlist

    from sklearn.neural_network import MLPClassifier
    from sklearn.linear_model import Perceptron, SGDClassifier
    from sklearn.multioutput import MultiOutputClassifier
    from sklearn.preprocessing import MultiLabelBinarizer
    from sklearn.preprocessing import normalize
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.datasets import make_multilabel_classification

    from joblib import dump, load
    from tqdm import tqdm
    from typing import List  # needed for the List[Track] annotation below

    import numpy as np
    from math import log10
    import os

    config.DATABASE_URL = connection_url()
    db.set_connection(connection_url())

    stopval = len(Track.nodes)
    print(stopval)
    print('Playlists', len(Playlist.nodes))

    playlists = Playlist.get_all()
    num_playlists = len(playlists)
    playlists = {node.uri: ind for ind, node in enumerate(playlists)}

    def get_minibatches(stopval, count=0, interval=20):
        while count < stopval:
            to_analyze: List[Track] = Track.get_songs_in_playlists(interval,
                                                                   count)
            X = [a.get_song_features(as_list=True) for a in to_analyze]
            y = [[playlists[x.uri] for x in Track.get_playlists(a.spotify_id)]
                 for a in to_analyze]
            print(count, interval)
            yield np.array(list(map(lambda x: list(map(abs, x)), X))), \
                MultiLabelBinarizer().fit_transform(y)
            count += len(to_analyze)

    lda = LatentDirichletAllocation(n_components=num_playlists)

    startval = 0
    if len(os.listdir('trained_models')) > 0:
        startval = max(os.listdir('trained_models'))
        lda = load(os.path.join('trained_models', startval))
        startval = int(startval.split('.')[0])

    interval = 20
    # get_minibatches(stopval, count=startval * interval)
    for i, val in enumerate(
            tqdm(get_minibatches(stopval, count=startval * interval))):
        i += startval + 1
        X, y = val
        lda.partial_fit(X)
        dump(lda, os.path.join(
            'trained_models',
            f'{str(i).zfill(int(log10(stopval) + 1))}.joblib'))
        os.remove(os.path.join(
            'trained_models',
            f'{str(i - 1).zfill(int(log10(stopval) + 1))}.joblib'))
def fast_lda_topics(X,
                    n_components: int = 10,
                    batch_size=128,
                    max_iter=100,
                    doc_topic_prior=None,
                    topic_word_prior=None,
                    learning_decay=.7,
                    learning_offset=10.,
                    total_samples=1e6,
                    max_doc_update_iter=100,
                    n_jobs=2,
                    random_state=1) -> LatentDirichletAllocation:
    r"""Latent dirichlet allocation using online variational Bayes method.

    In each EM update, use mini-batch of training data to update the
    ``components_`` variable incrementally. The learning rate is controlled
    by the ``learning_decay`` and the ``learning_offset`` parameters.

    Arguments:
      n_components : int, optional (default=10)
        Number of topics.
      doc_topic_prior : float, optional (default=None)
        Prior of document topic distribution `theta`. If the value is None,
        defaults to `1 / n_components`.
      topic_word_prior : float, optional (default=None)
        Prior of topic word distribution `beta`. If the value is None,
        defaults to `1 / n_components`.
      learning_decay : float, optional (default=0.7)
        It is a parameter that controls the learning rate in the online
        learning method. The value should be set between (0.5, 1.0] to
        guarantee asymptotic convergence. When the value is 0.0 and
        batch_size is ``n_samples``, the update method is the same as batch
        learning. In the literature, this is called kappa.
      learning_offset : float, optional (default=10.)
        A (positive) parameter that downweights early iterations in online
        learning. It should be greater than 1.0.
      max_iter : integer, optional (default=100)
        The maximum number of iterations.
      batch_size : int, optional (default=128)
        Number of documents to use in each EM iteration. Only used in online
        learning.
      total_samples : int, optional (default=1e6)
        Total number of documents. Only used in the :meth:`partial_fit`
        method.
      mean_change_tol : float, optional (default=1e-3)
        Stopping tolerance for updating document topic distribution in
        E-step.
      max_doc_update_iter : int (default=100)
        Max number of iterations for updating document topic distribution in
        the E-step.
      n_jobs : int or None, optional (default=2)
        The number of jobs to use in the E-step.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>` for
        more details.
      random_state : int, RandomState instance, default=1
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.
    """
    lda = LatentDirichletAllocation(
        n_components=n_components,
        batch_size=batch_size,
        max_iter=max_iter,
        doc_topic_prior=doc_topic_prior,
        topic_word_prior=topic_word_prior,
        learning_method='online',
        learning_decay=learning_decay,
        learning_offset=learning_offset,
        total_samples=total_samples,
        max_doc_update_iter=max_doc_update_iter,
        n_jobs=n_jobs,
        verbose=False,
        random_state=random_state,
    )
    prog = tqdm(desc="Perp(None)", total=max_iter)
    if isinstance(X, (tf.Tensor, tf.SparseTensor)):
        X = X.numpy()
    if isinstance(X, (np.ndarray, sparse.spmatrix)):
        for it in range(max_iter):
            lda.partial_fit(X)
            prog.update(1)
    elif isinstance(X, DatasetV2):
        for it, x in enumerate(
                X.repeat(-1).shuffle(100) if hasattr(X, 'repeat') else X):
            if it >= max_iter:
                break
            if isinstance(x, (tuple, list)):
                x = x[0]
            lda.partial_fit(x.numpy())
            if it % 10 == 0:
                perp = lda.perplexity(x)
                prog.desc = f"Perp({perp:.2f})"
            prog.update(1)
    prog.close()
    return lda
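A minimal, hypothetical usage sketch for the fast_lda_topics helper above, assuming its module-level dependencies (tensorflow as tf, scipy.sparse as sparse, tqdm, numpy as np) are importable; the document-term matrix is random and purely illustrative.

import numpy as np
from scipy import sparse

rng = np.random.RandomState(0)
# A toy 200-document x 500-term count matrix with small integer counts.
X_counts = sparse.random(200, 500, density=0.05, random_state=rng,
                         data_rvs=lambda n: rng.randint(1, 5, n)).tocsr()

# fast_lda_topics calls partial_fit in a loop internally; two iterations
# are enough for a quick smoke test.
lda = fast_lda_topics(X_counts, n_components=5, max_iter=2,
                      total_samples=200, n_jobs=1)
print(lda.components_.shape)  # (5, 500) topic-word matrix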
class HarmonizeClassRasters:
    def __init__(self, class_code2vocab, class_errmat,
                 vocab_creation="union", **kwargs):
        # class_code2vocab: list of dict,
        #   class_code_within_each_raster : common_class_code_or_name_across_rasters
        # class_errmat: list of pandas DataFrame.
        # vocab_creation: "union" or "combination", options to create the
        #   vocabulary from the input class labels of two or more rasters.
        #
        # kwargs: keyword arguments for the LDA model,
        #   LatentDirichletAllocation
        #   [http://scikit-learn.org/dev/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation]
        #   if scikit-learn package; LDA [https://lda.readthedocs.io/en/stable/]
        #   if lda package
        self._vocab_union = 1
        self._vocab_combination = 2

        self.class_code2vocab = [pd.Series(cv) for cv in class_code2vocab]

        errmat_in_vocab = []
        if class_errmat is None:
            errmat_in_vocab = [
                pd.DataFrame(np.eye(len(set(cv.values()))),
                             set(cv.values()), set(cv.values()))
                for cv in class_code2vocab
            ]
        else:
            for cv, old_em in zip(class_code2vocab, class_errmat):
                em = old_em.copy()
                em.index = pd.Series(cv).reindex(em.index).values
                em.columns = pd.Series(cv).reindex(em.columns).values
                errmat_in_vocab.append(em)
        self.class_errmat = errmat_in_vocab

        self.kwargs = kwargs

        # generate all the unique classes (words) as the vocabulary
        if vocab_creation == "union":
            self.vocab_creation = self._vocab_union
            self.vocab = list(
                set(list(itertools.chain(
                    *[c2v.values() for c2v in class_code2vocab]))))
            self._dw = pd.Series(np.zeros(len(self.vocab)),
                                 index=pd.Index(self.vocab))
            self._m2t_prob = None
        elif vocab_creation == "combination":
            self.vocab_creation = self._vocab_combination
            self.vocab = list(
                itertools.product(
                    *[set(c2v.values()) for c2v in class_code2vocab]))
            index = pd.MultiIndex.from_tuples(self.vocab)
            self._dw = pd.Series(np.zeros(len(self.vocab)), index=index)
            if class_errmat is None:
                m2t_prob = None
                # pd.DataFrame(np.eye(len(self.vocab)), index=index, columns=index)
            else:
                m2t_prob = pd.DataFrame(
                    np.zeros((len(self.vocab), len(self.vocab))),
                    index=index, columns=index)
                em_col_comb = list(
                    itertools.product(
                        *[em.columns.values for em in errmat_in_vocab]))
                for idx in m2t_prob.index.values:
                    m2t_list = [
                        errmat_in_vocab[i].loc[val, :] /
                        errmat_in_vocab[i].loc[val, :].sum()
                        for i, val in enumerate(idx)
                    ]
                    m2t_list = list(zip(*itertools.product(*m2t_list)))
                    m2t = m2t_list[0]
                    for val in m2t_list[1:]:
                        m2t = np.multiply(m2t, val)
                    m2t_prob.loc[idx, em_col_comb] = m2t
            self._m2t_prob = m2t_prob
        else:
            raise RuntimeError("Unknown option for vocabulary creation")

        self.lda = LatentDirichletAllocation(**kwargs)

    def _translateArray(self, img, code2vocab):
        # img: 2D array
        # code2vocab: pandas series to translate class codes (indexes of the
        # series) to vocabulary codes (values of the series).
        out = img.copy()
        for idx, v in code2vocab.items():
            if idx != v:
                out[img == idx] = v
        return out

    def genDocWordFromArray(self, multiband_img, use_errmat=True, N_factor=1):
        self._dw[:] = 0
        img_list = []
        for ib in range(multiband_img.shape[2]):
            img = multiband_img[:, :, ib]
            img_list.append(
                self._translateArray(img, self.class_code2vocab[ib]))
        if self.vocab_creation == self._vocab_union:
            for ib, words in enumerate(img_list):
                uw, uc = np.unique(words, return_counts=True)
                uw_mask = np.ones_like(uw, dtype=np.bool)
                for v in set(uw) - set(self.vocab):
                    uw_mask = np.logical_and(uw_mask, uw != v)
                uw = uw[uw_mask]
                uc = uc[uw_mask]
                n_words = np.sum(uc)
                if use_errmat:
                    # Do adjustment of word counts according to error matrix
                    em = self.class_errmat[ib]
                    em_row = em.loc[uw, :].values
                    tmp = em_row / np.tile(
                        np.sum(em_row, axis=1)[:, np.newaxis],
                        (1, em_row.shape[1]))
                    uc = np.matmul(uc, tmp)
                    uw = em.columns
                # Calculate the proportion of vocabulary words in this image.
                # Later N_factor multiplication gives word counts that create
                # a document of this designated number of words. This can be
                # used to have all the documents of the same lengths/word
                # counts in the LDA training.
                self._dw.loc[uw] += uc / n_words
        elif self.vocab_creation == self._vocab_combination:
            uw, uc = np.unique(
                np.asarray(list(zip(*[img.flatten() for img in img_list]))),
                axis=0, return_counts=True)
            uw_mask = np.ones(uw.shape[0], dtype=np.bool)
            for v in set([tuple(val) for val in uw.tolist()]) - set(self.vocab):
                uw_mask = np.logical_and(
                    uw_mask,
                    np.all(uw != np.tile(v, (uw.shape[0], 1)), axis=1))
            uw = uw[uw_mask, :]
            uc = uc[uw_mask]
            n_words = np.sum(uc)
            uw = [tuple(val) for val in uw]
            if use_errmat and (self._m2t_prob is not None):
                uc = np.matmul(uc, self._m2t_prob.loc[uw, :])
                uw = self._m2t_prob.columns.values
            # Calculate the proportion of vocabulary words in this image.
            # Later N_factor multiplication gives word counts that create a
            # document of this designated number of words. This can be used
            # to have all the documents of the same lengths/word counts in
            # the LDA training.
            self._dw.loc[uw] = uc / n_words
        else:
            raise RuntimeError("Unknown option for vocabulary creation.")
        return self._dw.values.copy() * N_factor

    def fitTopicModel(self, X, partial=True):
        if partial:
            self.lda.partial_fit(X)
        else:
            self.lda.fit(X)

    def getTopicWordDist(self):
        return self.lda.components_

    def estDocTopicDist(self, X):
        return self.lda.transform(X)

    def estHarmonized(self, mb_img, img_mask, N_factor=1,
                      class_nodata=0, prob_nodata=0):
        # img_mask: valid being 1 and invalid (not to be processed) being 0.
        win_ysize, win_xsize, nbands = mb_img.shape
        pixel_mask = img_mask.ravel()
        pixel_word = np.array(
            [mb_img[:, :, ib].ravel() for ib in range(nbands)]).T
        pixel_prob = np.zeros(
            (len(pixel_mask), self.lda.n_components)) + prob_nodata
        pixel_class = np.zeros(len(pixel_mask)) + class_nodata
        if np.sum(pixel_mask) > 0:
            pixel_prob[pixel_mask, :] = self.estDocTopicDist(
                np.array([
                    self.genDocWordFromArray(pw[np.newaxis, np.newaxis, :],
                                             use_errmat=True,
                                             N_factor=N_factor)
                    for pw in pixel_word[pixel_mask, :]
                ]))
            pixel_class[pixel_mask] = np.argmax(
                pixel_prob[pixel_mask, :], axis=1) + 1
        class_img = pixel_class.reshape(win_ysize, win_xsize)
        prob_img = np.dstack([
            pixel_prob[:, ib].reshape(win_ysize, win_xsize)
            for ib in range(nbands)
        ])
        return class_img, prob_img
def LDA(review_data, df, n_features=10000, length=10, n_top_words=25,
        max_df=0.01, min_df=0.00001, n_components=30, max_features=None,
        min_samples_split=None, max_depth=None, min_samples_leaf=None,
        myCsvRow=None):
    print "Start tf_vectorizer"
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    max_features=n_features,
                                    stop_words='english',
                                    token_pattern=r"(?u)\b[A-Za-z0-9]{3,}\b")
    tf = tf_vectorizer.fit_transform(review_data)
    tf = tf[np.array(tf.sum(1)).flatten() > length, :]
    tf_feature_names = tf_vectorizer.get_feature_names()
    test = tf[tf.shape[0] - 100000:]
    tf = tf[:tf.shape[0] - 100000]
    file1_name = ('TF_Vectorizer_' + 'Topic' + str(n_components) +
                  '_Feature' + str(n_features) + '_length' + str(length) +
                  'max_df' + str(max_df) + 'min_df' + str(min_df) + '.pkl')
    joblib.dump(tf_vectorizer, file1_name)
    print "Finished tf_vectorizer"

    print "start LDA"
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_method='online', verbose=1,
                                    learning_decay=0.5, batch_size=4096,
                                    learning_offset=64,
                                    total_samples=tf.shape[0],
                                    random_state=0, n_jobs=8)
    last_bound = 1000000
    for it in range(8):
        for i, ll in enumerate(chunks(range(tf.shape[0]), 100000)):
            lda.partial_fit(tf[ll])
            bound = lda.perplexity(test)
            print "perplexity:", bound
            if last_bound and last_bound - bound < 0.1:
                break
            last_bound = bound
    print_top_words(lda, tf_feature_names, n_top_words)
    file2_name = ('LDA_' + 'Topic' + str(n_components) + '_Feature' +
                  str(n_features) + '_length' + str(length) + 'max_df' +
                  str(max_df) + 'min_df' + str(min_df) + '.pkl')
    joblib.dump(lda, file2_name)
    print "Finished LDA"

    # ######################## Machine Learning part ########################
    target_name = ['BugsCrashes', 'Experience', 'Hardware', 'Pricing']
    data = pd.concat(
        [pd.DataFrame(lda.transform(tf_vectorizer.transform(df.Body.tolist()))),
         df.BugsCrashes, df.Experience, df.Hardware, df.Pricing], 1)
    X = lda.transform(tf_vectorizer.transform(df.Body.tolist()))
    y = df[['BugsCrashes', 'Experience', 'Hardware', 'Pricing']]
    y = np.array(y)
    full_rf_pred = np.empty((0, 4))
    full_y_test = np.empty((0, 4))
    k_fold = KFold(data.shape[0], n_folds=10, shuffle=True, random_state=40)
    for fold in k_fold:
        train_idx = fold[0]
        test_idx = fold[1]
        X_train, y_train = X[train_idx, :], y[train_idx, :]
        X_test, y_test = X[test_idx, :], y[test_idx, :]
        rf = RandomForestClassifier(
            n_jobs=8, random_state=10, n_estimators=300,
            max_features=max_features, min_samples_split=min_samples_split,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf).fit(X_train, y_train)
        rf_pred = rf.predict(X_test)
        full_rf_pred = np.append(full_rf_pred, rf_pred, axis=0)
        full_y_test = np.append(full_y_test, y_test, axis=0)
    print '############rf#############\n', classification_report(
        full_y_test, full_rf_pred, target_names=target_name, digits=3)
    with open('classification_report.csv', 'a') as csvfile:
        csvfile.write('\n')
        csvfile.write(myCsvRow)
        csvfile.write('\n')
    report = classification_report(full_y_test, full_rf_pred,
                                   target_names=target_name)
    classification_report_csv(report)
corpus = open('E:\\dataset2.csv').read()
docs = corpus.split('\n')

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(docs)

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2, max_iter=200,
                                learning_offset=4.0,
                                learning_method='online')

step = matrix_X.shape[0] / 10
step = int(step)
index = 0
for i in range(10):
    if i == 9:
        lda.partial_fit(matrix_X[index:])
    else:
        lda.partial_fit(matrix_X[index:index + step])
        index = index + step
    print('\niteration ', i)
    print(lda.components_)
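As an illustrative variant (not part of the original script), the same chunked partial_fit loop can be written with sklearn.utils.gen_batches, which removes the manual index bookkeeping; it reuses the matrix_X and lda objects defined above.

from sklearn.utils import gen_batches

n_samples = matrix_X.shape[0]
batch_size = max(1, n_samples // 10)

# gen_batches yields slice objects; the last slice is simply shorter when
# n_samples is not divisible by batch_size, so no special case is needed.
for i, batch in enumerate(gen_batches(n_samples, batch_size)):
    lda.partial_fit(matrix_X[batch])
    print('\niteration ', i)
    print(lda.components_)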
print("Handling %s LDA" % file_name) Data = json.load(open(file_name)) handle_doc_num = 0 Tmp_Total_Metrix = [] for Doc_obj in Data: Tmp_metrix = [0] * len(Vocab_cut) handle_doc_num += 1 Doc_Content = Doc_obj['main_content'].split(" ") for term in Doc_Content: if (term in Vocab_to_index): Tmp_metrix[Vocab_to_index[term]] += 1 Tmp_Total_Metrix.append(Tmp_metrix) if (handle_doc_num >= 1024): clf.partial_fit(Tmp_Total_Metrix) Tmp_Total_Metrix = [] handle_doc_num = 0 data_dir.append('test.json') for i in range(3): print("Predicting %s By LDA Model" % data_class[i]) file_name = data_dir[i] Data = json.load(open(file_name)) output_data = [] for Doc_obj in Data: Tmp_metrix = [0] * len(Vocab_cut) Doc_Content = Doc_obj['main_content'].split(" ") for term in Doc_Content: if (term in Vocab_to_index): Tmp_metrix[Vocab_to_index[term]] += 1
train_scores = []        # size: (max_iter / valid_iter) * (n_splits)
test_scores = []         # size: (max_iter / valid_iter) * (n_splits)
train_perplexities = []  # size: (max_iter / valid_iter) * (n_splits)
test_perplexities = []   # size: (max_iter / valid_iter) * (n_splits)

for i in range(int(max_iter / valid_iter)):
    train_s = []
    test_s = []
    train_p = []
    test_p = []
    print '\ntraining ', i * valid_iter + 1, '-th iteration'
    for train_index, test_index in splited_index:
        train_data, test_data = dataset[train_index], dataset[test_index]
        lda_model.partial_fit(train_data)
        train_s.append(lda_model.score(train_data))
        test_s.append(lda_model.score(test_data))
        train_p.append(lda_model.perplexity(train_data))
        test_p.append(lda_model.perplexity(test_data))
    train_scores.append(train_s)
    test_scores.append(test_s)
    train_perplexities.append(train_p)
    test_perplexities.append(test_p)
    print "train_scores: ", train_scores[i], \
        " test_scores: ", test_scores[i], \
        " train_perplexities: ", train_perplexities[i], \
        " test_perplexities: ", test_perplexities[i]
N_CLASSES = np.unique(y_train)

scores_train = []
scores_test = []

# EPOCH
epoch = 0
while epoch < N_EPOCHS:
    print('epoch: ', epoch)
    # SHUFFLING
    random_perm = np.random.permutation(X_train.shape[0])
    mini_batch_index = 0
    while True:
        # MINI-BATCH
        indices = random_perm[mini_batch_index:mini_batch_index + N_BATCH]
        clf.partial_fit(X_train[indices], y_train[indices],
                        classes=N_CLASSES)
        mini_batch_index += N_BATCH
        if mini_batch_index >= N_TRAIN_SAMPLES:
            break

    # SCORE TRAIN
    scores_train.append(clf.score(X_train, y_train))
    # SCORE TEST
    scores_test.append(clf.score(X_test, y_test))

    epoch += 1

plt.figure()
plt.plot(scores_train, color='b', alpha=0.8, label='Train')
plt.plot(scores_test, color='r', alpha=0.8, label='Test')