Example #1
 def applyNMF(self, number_of_clusters, country_specific_tweets):
     train, feature_names = self.extractFeatures(country_specific_tweets,False)
     name = "nmf"
     
     # Fit the NMF model
     if self.results:
         print("Fitting the NMF model", end=" - ")
     
     t0 = time()
     nmf = NMF(n_components=number_of_clusters, random_state=1, alpha=.1, l1_ratio=.5).fit(train)
     
     if self.results:
         print("done in %0.3fs." % (time() - t0))
     
     if self.results:
         print("\nNMF:")
     
     parameters = nmf.get_params()
     
     if self.results:
         print("Parameter: " + str(parameters))
     topics = nmf.components_
     doc_topic = nmf.transform(train)
     top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
     labels = numpy.asarray(labels)
     
     if self.results:
         print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
                
     return name, parameters, top10, labels
Example #2
def tfidf_nmf(release_texts, n_components=10, max_features=None):
    '''
        Creates and fits tfidf and NMF models.

        INPUT:
        - n_components: number of latent features for the NMF model to find
        - max_features: max number of features (vocabulary size) for the tfidf model to consider

        OUTPUT:
        - tfidf_vectorizer: tfidf model object
        - tfidf_sparse: tfidf sparse matrix
        - nmf: NMF model object
        - W: Feature matrix output from NMF factorization into W and H matrices
    '''
    # tfidf model
    custom_stop_words = make_stop_words()
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words, max_features=max_features)
    tfidf_sparse = tfidf_vectorizer.fit_transform(release_texts)

    # normalize row-wise so each row sums to one
    tfidf_sparse = normalize(tfidf_sparse, axis=1, norm='l1')

    # nmf model
    nmf = NMF(n_components=n_components, random_state=1)
    nmf.fit(tfidf_sparse)
    W = nmf.transform(tfidf_sparse)
    return tfidf_vectorizer, tfidf_sparse, nmf, W
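
As a usage note, here is a minimal standalone sketch of the same tf-idf to NMF pattern on a toy corpus; the documents below and the plain 'english' stop words (standing in for make_stop_words()) are illustrative assumptions, not part of the original.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

docs = [
    "stocks rallied as quarterly earnings beat expectations",
    "the central bank raised interest rates again",
    "quarterly earnings and revenue growth lifted stocks",
    "rate hikes by the central bank cooled inflation",
]

toy_vectorizer = TfidfVectorizer(stop_words='english')
toy_tfidf = toy_vectorizer.fit_transform(docs)

toy_nmf = NMF(n_components=2, random_state=1)
W = toy_nmf.fit_transform(toy_tfidf)   # document-topic matrix
H = toy_nmf.components_                # topic-term matrix

terms = toy_vectorizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn
for k, topic in enumerate(H):
    top_terms = [terms[i] for i in topic.argsort()[::-1][:5]]
    print("topic %d: %s" % (k, ", ".join(top_terms)))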
Example #3
def hog2hognmf(hog_feature):
    """Transform HOG feature into HOG-NMF feature.

    Parameters
    ----------
    hog_feature: np.ndarray
      HOG feature.
    """
    mat = np.zeros((500, 8), dtype=np.float32)
    NMFmodel = NMF(n_components=2, init="random", random_state=0)
    # Transform 3780 into 500 * 8
    for i in range(7):
        mat[:, i] = hog_feature[i * 500 : (i + 1) * 500]
    mat[:280, 7] = hog_feature[3500:]
    W = NMFmodel.fit_transform(mat)
    H = NMFmodel.components_
    hognmf_feature = np.array([], dtype=np.float32)
    for i in range(8):
        _sum = np.sum(H[:, i])
        if _sum == 0:
            H[:, i] *= 0.0
        else:
            H[:, i] /= _sum
        hognmf_feature = np.append(hognmf_feature, H[:, i])
    for i in range(500):
        _sum = np.sum(W[i, :])
        if _sum == 0:
            W[i, :] *= 0.0
        else:
            W[i, :] /= _sum
        hognmf_feature = np.append(hognmf_feature, W[i, :])
    return hognmf_feature
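
A quick sanity check for the helper above (a sketch, assuming hog2hognmf is in scope and using a random non-negative 3780-dimensional vector as a stand-in for a real HOG descriptor): the output holds the 8 normalized columns of H followed by the 500 normalized rows of W, i.e. 8 * 2 + 500 * 2 = 1016 values.

import numpy as np

# Hypothetical input: a random non-negative HOG descriptor of length 3780.
rng = np.random.RandomState(0)
fake_hog = rng.rand(3780).astype(np.float32)

feat = hog2hognmf(fake_hog)
print(feat.shape)        # expected: (1016,)
print(feat.min() >= 0)   # factors stay non-negative after normalization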
Example #4
    def nmf(self, **kwargs):
        """Perform dimensionality reduction using NMF."""
        nmf = NMF(**kwargs)

        reduced_matrix = nmf.fit_transform(self.matrix)
        # TODO: it is incorrect to pass self.column_labels! There are no column labels.
        return Space(reduced_matrix, self.row_labels, self.column_labels)
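
Regarding the TODO above, one option (a sketch only, with a namedtuple standing in for the real Space container) is to label the reduced axes by component index, since the NMF components no longer correspond to the original columns.

from collections import namedtuple

import numpy as np
from sklearn.decomposition import NMF

# Illustrative stand-in for the Space container used above.
Space = namedtuple('Space', ['matrix', 'row_labels', 'column_labels'])

matrix = np.abs(np.random.RandomState(0).rand(6, 5))
row_labels = ['row_%d' % i for i in range(matrix.shape[0])]

nmf = NMF(n_components=2, random_state=0)
reduced_matrix = nmf.fit_transform(matrix)

# Name the new axes after the components instead of reusing the input column labels.
component_labels = ['nmf_component_%d' % i for i in range(reduced_matrix.shape[1])]
space = Space(reduced_matrix, row_labels, component_labels)
print(space.column_labels)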
Example #5
def nmf_df(sym, k, coll):
    data = [ item for item in coll.find({'text': { '$in' :[re.compile(sym)] }}) ]
    sents = [ sentence['text'] for sentence in data ]
    dates = [ str(text['created_at']) for text in data ]
    d = np.array(dates).T
    d = d.reshape(len(dates), 1)

    vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
    X = vectorizer.fit_transform(sents)
    #features = vectorizer.get_feature_names()

    model = NMF(n_components=k, init='random', random_state=0)
    latent_features = model.fit_transform(X)

    # lat0 = list(latent_features[:,0])
    # lat1 = list(latent_features[:,1])
    # lat2 = list(latent_features[:,2])
    # lat3 = list(latent_features[:,3])

    df = pd.DataFrame(latent_features)   #np.concatenate((d, latent_features), axis=1)
    df.columns = [ 'lat'+ str(n) for n in xrange(len(df.columns)) ]
    df['time_stamp'] = d
    #print df.head()

    df['date'] = pd.to_datetime(df['time_stamp']).apply(pd.datetools.normalize_date)
    df.pop('time_stamp')
    #print df.head()
    grouped_data = df.groupby(['date']).mean()
    grouped_data['sym'] = sym

    return grouped_data
Example #6
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0,
               max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert_less(pnmf.fit(X).reconstruction_err_, 0.1)
Example #7
def find_aspects(sentences, city, n_top_words=15):
    '''
    INPUT sentences, city(str, lower case)
    OUTPUT aspects dictionary
    '''
    vectorizer = TfidfVectorizer(max_features=n_features, stop_words='english')
    document_term_mat = vectorizer.fit_transform(sentences)
    feature_words = vectorizer.get_feature_names()

    nmf = NMF(n_components=n_topics)
    W_sklearn = nmf.fit_transform(document_term_mat)
    H_sklearn = nmf.components_
    important_words = []

    for topic in H_sklearn:
        for i in topic.argsort()[:-n_top_words - 1:-1]:
                important_words.append(feature_words[i])
    important_words = set(important_words)
    important_words = list(important_words)

    nouns = []
    for i in sentences: nouns.extend(list(TextBlob(i).noun_phrases))
    noun_list = list(set(filter(lambda x: (len(x.split(' '))>1)&('...' not in x.split(' ')), nouns)))
    aspects_dict = defaultdict(list)

    for i in important_words:
        if i not in [city, city.lower(),'okay','ok','thing','things','time','times','greasy','awful'] and TextBlob(i).tags[0][1] in ['NN', 'NNS']:
            for j in noun_list:
                if i in j.split(' '):
                    aspects_dict[i].append(j)
    for i in aspects_dict: aspects_dict[i] = list(set(aspects_dict[i]))

    return aspects_dict
Example #8
def extract_reconstruction_errors(comps, music_stft, window_length, hop):
	K = comps.shape[1]
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	start = 0
	errors = []

	while (start + window_length < music_stft.shape[1]):
		block = music_stft[:, start:start+window_length]
		
		H = np.random.rand(K, block.shape[1])
		W[:, 0:K] = comps
		
		params = {'W': W, 'H': H, 'update_W': False}
		comps_block = transformer.fit_transform(np.abs(block), **params)
		acts_block = transformer.components_
	
		#reconstruct the signal
		block_reconstruction = comps_block.dot(acts_block)
		errors.append(transformer.reconstruction_err_)

		start = start + hop
	return errors
Example #9
def produceEncoding( trainX, nComponents ):
    '''Produces an NMF encoding from the training
    data matrix'''
    model = NMF( n_components=nComponents, solver='cd', \
                tol=1e-4, max_iter=200, alpha=0.0 )
    model.fit( trainX )
    return model
Example #10
def fit_nmf(tfidf):
    '''takes in a tfidf sparse vector and finds the top topics'''
    nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5)
    nmf.fit(tfidf)
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    nmf_topic_dict = print_top_words(nmf, tfidf_feature_names, n_top_words)
    return nmf, nmf_topic_dict
Example #11
def extract_reconstruction_error_beats(comps, music_stft, beats):
	K = comps.shape[1]
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	start = 0
	errors = []
	lookback = 0
	weight = np.array([1 for i in range(2, music_stft.shape[0] + 2)])
	weight = weight/np.max(weight)
	for i in range(lookback+1, len(beats)):
		block = music_stft[:, beats[i-(lookback+1)]:beats[i]]
		
		H = np.random.rand(K, block.shape[1])
		W[:, 0:K] = comps
		
		params = {'W': W, 'H': H, 'update_W': False}
		comps_block = transformer.fit_transform(np.abs(block), **params)
		acts_block = transformer.components_

		#reconstruct the signal
		block_reconstruction = comps_block.dot(acts_block)
		
		block_reconstruction = block_reconstruction.T*weight
		block = block.T*weight
		distance = norm(block_reconstruction - np.abs(block))
		#errors.append(transformer.reconstruction_err_)
		errors.append(distance)
	return errors
Example #12
def extract_template(comps, music_stft):
	K = comps.shape[1]
	
	#initialize transformer (non-negative matrix factorization) with K components
	transformer = NMF(n_components = K, init = 'custom')
	
	#W and H are random at first
	W = np.random.rand(comps.shape[0], K)
	H = np.random.rand(K, music_stft.shape[1])
	
	#set W to be the template components you want to extract
	W[:, 0:K] = comps

	#don't let W get updated in the non-negative matrix factorization
	params = {'W': W, 'H': H, 'update_W': False}
	comps_music = transformer.fit_transform(np.abs(music_stft), **params)
	acts_music = transformer.components_
	
	#reconstruct the signal
	music_reconstruction = comps_music.dot(acts_music)

	#mask the input signal
	music_stft_max = np.maximum(music_reconstruction, np.abs(music_stft))
	mask = np.divide(music_reconstruction, music_stft_max)
	mask = np.nan_to_num(mask)
	
	#binary mask
	mask = np.round(mask)

	#template - extracted template, residual - everything that's leftover.
	template = np.multiply(music_stft, mask)
	residual = np.multiply(music_stft, 1 - mask)

	return template, residual
Example #13
def doNMF(datan,n_components=4):
    # from Mitsu
    # alternatively PCA ... might be faster
    nmf=NMF(n_components=n_components,init='nndsvd')
    data_decomp_all=nmf.fit_transform(datan)
    data_components_all=nmf.components_
    return data_decomp_all,data_components_all
Example #14
    def _make_test_matrix(self, matrix, test_decomp='svd'):
        '''
        Input: a matrix
        Output: a recomposed estimated ratings matrix

        Decomposes input matrix according to decomposition type
        and then makes an estimated ratings matrix
        '''
        if test_decomp == 'svd':
            _, s1, V = svd(matrix)
            how = self.s_option
            how = self.test_how
            #print "s1", s1
            #print "how", how
            s = self._get_s(s1, how)
            #print s
            #print V
            #print self.matrix_1.U
            return np.dot(self.matrix_1.U, np.dot(s, V))
        elif test_decomp == 'nmf':
            model = NMF()
            H = model.fit_transform(matrix)
            print H
            W = model.components_
            return np.dot(self.matrix_1.H, W)
        else:
            pass

        '''
Example #15
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    m = NMF(n_components=4, init="nndsvd", random_state=0)
    ft = m.fit_transform(A)
    t = m.transform(A)
    assert_array_almost_equal(ft, t, decimal=2)
Example #16
	def __Factorize_NMF(self,K):
		model = NMF(n_components=K,max_iter=self._iteration)
		model.fit(self._mat)
		user_fmat = model.fit_transform(self._mat)
		item_fmat = model.components_.T

		return user_fmat,item_fmat
Example #17
def do_NMF(sparse_matrix):
  t0 = time.time()
  print("* Performing NMF on sparse matrix ... ")
  nmf = NMF(n_components=3)
  coordinates = nmf.fit_transform(sparse_matrix)
  print("done in %0.3fs." % (time.time() - t0))
  return(coordinates)
Example #18
def nnMatrixFactorisation(data, labels, new_dimension):
    print "non negative matrix factorisation..."
    start = time.time()
    mf = NMF(n_components=new_dimension)
    reduced = mf.fit_transform(data)
    end = time.time()
    return (reduced, end-start)
Example #19
def find_template(music_stft, sr, min_t, n_components, start, end):
    """
    from Prem
    :param music_stft:
    :param sr:
    :param min_t:
    :param n_components:
    :param start:
    :param end:
    :return:
    """
    template_stft = music_stft[:, start:end]
    layer = librosa.istft(template_stft)
    layer_rms = np.sqrt(np.mean(layer * layer))

    comps = []
    acts = []
    errors = []

    for T in range(min_t, n_components):
        transformer = NMF(n_components=T)
        comps.append(transformer.fit_transform(np.abs(template_stft)))
        acts.append(transformer.components_)
        errors.append(transformer.reconstruction_err_)

    # knee = np.diff(errors, 2)
    # knee = knee.argmax() + 2
    knee = 0

    # print 'Using %d components' % (knee + min_t)
    return comps[knee], acts[knee]
Example #20
def test_nmf_fit_nn_output():
    # Test that the decomposition does not contain negative values
    A = np.c_[5 * np.ones(5) - np.arange(1, 6), 5 * np.ones(5) + np.arange(1, 6)]
    for init in (None, "nndsvd", "nndsvda", "nndsvdar"):
        model = NMF(n_components=2, init=init, random_state=0)
        transf = model.fit_transform(A)
        assert_false((model.components_ < 0).any() or (transf < 0).any())
Example #21
def nmf_model2(n_topics,document_term_mat):
    # print("\n\n---------\n decomposition")
    nmf = NMF(n_components=n_topics, l1_ratio=0.0)
    W_sklearn = nmf.fit_transform(document_term_mat)
    H_sklearn = nmf.components_
    # describe_nmf_results(document_term_mat, W_sklearn, H_sklearn)
    return W_sklearn, H_sklearn
Example #22
 def infer_topics(self, num_topics=10):
     self.nb_topics = num_topics
     nmf = NMF(n_components=num_topics)
     topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space)
     self.topic_word_matrix = []
     self.document_topic_matrix = []
     vocabulary_size = len(self.corpus.vocabulary)
     row = []
     col = []
     data = []
     for (topic_idx, topic) in enumerate(nmf.components_):
         for i in range(vocabulary_size):
             row.append(topic_idx)
             col.append(i)
             data.append(topic[i])
     self.topic_word_matrix = coo_matrix((data, (row, col)),
                                         shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()
     row = []
     col = []
     data = []
     doc_count = 0
     for doc in topic_document:
         topic_count = 0
         for topic_weight in doc:
             row.append(doc_count)
             col.append(topic_count)
             data.append(topic_weight)
             topic_count += 1
         doc_count += 1
     self.document_topic_matrix = coo_matrix((data, (row, col)),
                                             shape=(self.corpus.size, self.nb_topics)).tocsr()
Example #23
def reduceDimensionality(n_components=100):
	# import the csv into a pandas df
	df = pd.read_csv('data/gameData.csv')

	# Normalize the numeric columns to values in [0,1]
	numericColumns = ['maxPlayers','maxPlaytime','minAge','minPlayers','minPlaytime','playtime']
	colsToNormalize = []
	for col in numericColumns:
		if col in df.columns:
			colsToNormalize.append(col)

	df[colsToNormalize] = df[colsToNormalize].apply(lambda x: (x - x.min())/(x.max() - x.min())/2)

	# Drop string columns
	colsToDrop = ['artists','categories','designers','families','publishers','mechanics','boardGameId','yearPublished']

	# Convert df to an array for NMF and store the board game id column to attach later
	boardGameIds = df['boardGameId']
	arr = df.as_matrix([col for col in df.columns if col not in colsToDrop])
	arr = np.nan_to_num(arr)

	# Perform NMF with n_dimensions
	model = NMF(n_components=n_components)
	W = model.fit_transform(arr)
	W = np.insert(W, 0, boardGameIds, axis=1)

	np.savetxt("data/reducedGameFeatures.csv", W, delimiter=",")
Example #24
def extractTemplate(y, w=d_w, h=d_h, n_components=nc):
    model = NMF(n_components=n_components, max_iter=max_iter, beta=beta)
    S = librosa.core.stft(y, n_fft=w, hop_length=h)
    model.fit_transform(np.abs(S).T)
    components = model.components_.T
    #components, activation = librosa.decompose.decompose(np.abs(S), n_components=3)
    return components
Example #25
def get_topics_nmf(urls, num_topics):
    '''Input: URL containing links to each document (pdf) in the
    corpus (i.e. arxiv)  Output: the num_topics most important latent
    topics from the corpus (via NMF)
    '''
    article_info = []
    for url in urls:
        article_info.append(get_text(url))

    text = []
    for thing in article_info:
        text.extend(thing[0])
    text = clean_pdf_text(text)

    tfidf_math = TfidfVectorizer(max_features=100, stop_words=math_stop(),
                                 ngram_range=(1, 1), decode_error='ignore')
    M = tfidf_math.fit_transform(text)

    feature_names = tfidf_math.get_feature_names()
    feature_names = [WordNetLemmatizer().lemmatize(word)
                     for word in feature_names]
    nmf = NMF(n_components=num_topics)
    nmf.fit(M)
    topics = []
    for topic_idx, topic in enumerate(nmf.components_):
        topics.append((" ".join([feature_names[i] for i in
                                topic.argsort()[:-10 - 1:-1]])))
    return M, topics, text, title_list, urls
Example #26
 def extract_tfidf_nmf_feats(self, df_data, n_components):
     """
     Extract tfidf features using nmf.     
     """        
     df_feat = pd.DataFrame(index=range(df_data.shape[0]))
     tfidf = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')
     tsvd = TruncatedSVD(n_components=n_components, random_state = 2016)
     nmf = NMF(solver='cd', n_components=n_components, init='nndsvda',
                 random_state=0, tol=1e-3)
     df_data['q'].to_csv('q', index=False)
     df_data['t'].to_csv('t', index=False)
     df_data['d'].to_csv('d', index=False)
     print('fitting in tfidf')
     tfidf.set_params(input='filename')        
     tfidf.fit(['q','t','d'])
     tfidf.set_params(input='content')  
     for col in ['d', 't', 'q', 'b']:
         print('process column', col)
         txt = df_data[col]
         tfidf_mat = tfidf.transform(txt)
         nd_feat = nmf.fit_transform(tfidf_mat)
         tmp = pd.DataFrame(nd_feat, columns=[col+'_tfidf_nmf_comp'+str(i) \
                                     for i in range(n_components)])
         df_feat = pd.merge(df_feat, tmp, left_index=True, right_index=True)
     saveit(df_feat, 'df_tfidf_nmf_feats')
Example #27
def get_LDA(X, num_components=10, show_topics=True):
	""" Latent Dirichlet Allication by NMF.
	21 Nov 2015, Keunwoo Choi

	LDA for a song-tag matrix. The motivation is the same as get_LSI.
	With NMF, it is easier to explain what each topic represents - by inspecting the 'H' matrix,
	where X ~= X' = W*H as a result of NMF.
	It is also good to have non-negative elements, which keeps both W and H straightforward to interpret.

	"""

	from sklearn.decomposition import NMF
	if X is None:
		print 'X is omitted, so just assume it is the mood tag mtx w audio.'
		X = np.load(PATH_DATA + FILE_DICT["mood_tags_matrix"]) #np matrix, 9320-by-100

	nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt.
	W = nmf.fit_transform(X)
	H = nmf.components_
	print '='*60
	print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1]))

	term_rankings = []
	moodnames = cP.load(open(PATH_DATA + FILE_DICT["moodnames"], 'r')) #list, 100
	for topic_index in range( H.shape[0] ):
		top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]
		term_ranking = [moodnames[i] for i in top_indices]
		term_rankings.append(term_ranking)
		if show_topics:	
			print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) )
	print '='*60
	cP.dump(term_rankings, open(PATH_DATA + (FILE_DICT["mood_topics_strings"] % num_components), 'w'))
	return W / np.max(W) # return normalised matrix, [0, 1]
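
For reference, the "average error" printed above is nmf.reconstruction_err_ (the Frobenius norm of X - W*H for the default loss) divided by the number of matrix entries; a small sketch with an illustrative random non-negative X standing in for the song-tag matrix:

import numpy as np
from sklearn.decomposition import NMF

X = np.abs(np.random.RandomState(0).rand(20, 10))  # stand-in for the song-tag matrix
nmf = NMF(init='nndsvd', n_components=3, max_iter=400)
W = nmf.fit_transform(X)
H = nmf.components_

frobenius = np.linalg.norm(X - W.dot(H))            # ||X - W*H||_F
print(frobenius, nmf.reconstruction_err_)           # the two values should closely agree
print(nmf.reconstruction_err_ / (X.shape[0] * X.shape[1]))  # the 'average error' above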
Example #28
def get_LDA(X, num_components=10, show_topics=True):
	''' Latent Dirichlet Allocation by NMF.
	21 Nov 2015, Keunwoo Choi

	LDA for a song-tag matrix. The motivation is the same as get_LSI.
	With NMF, it is easier to explain what each topic represents - by inspecting the 'H' matrix,
	where X ~= X' = W*H as a result of NMF.
	It is also good to have non-negative elements, which keeps both W and H straightforward to interpret.

	'''

	from sklearn.decomposition import NMF
	
	nmf = NMF(init='nndsvd', n_components=num_components, max_iter=400) # 400 is too large, but it doesn't hurt.
	W = nmf.fit_transform(X)
	H = nmf.components_
	print '='*60
	print "NMF done with k=%d, average error:%2.4f" % (num_components, nmf.reconstruction_err_/(X.shape[0]*X.shape[1]))

	term_rankings = []
	moodnames = cP.load(open(PATH_DATA + FILE_DICT['sorted_tags'], 'r')) #list, 100
	for topic_index in range( H.shape[0] ):
		top_indices = np.argsort( H[topic_index,:] )[::-1][0:10]
		term_ranking = [moodnames[i] for i in top_indices]
		term_rankings.append(term_ranking)
		if show_topics:	
			print "Topic %d: %s" % ( topic_index, ", ".join( term_ranking ) )
	print '='*60
	cP.dump(nmf, open(PATH_DATA + 'NMF_object.cP', 'w'))
	cP.dump(term_rankings, open(PATH_DATA + ('topics_strings_%d_components.cP' % num_components), 'w'))
	for row_idx, row in enumerate(W):
		if np.max(row) != 0:
			W[row_idx] = row / np.max(row)
	return W / np.max(W) # return normalised matrix, [0, 1]
	''''''
Example #29
class NMFReducer():

    def __init__(self, dataset, dataset_name, num_components=10):
        self.dataset = dataset
        self.dataset_name = dataset_name
        self.labels = dataset.target
        self.scaler = MinMaxScaler()
        self.data = self.scaler.fit_transform(dataset.data)
        self.n_samples, self.n_features = self.data.shape

        self.reducer = NMF(n_components=num_components, max_iter=5000)

    def reduce(self):
        self.reducer.fit(self.data)
        self.reduced = self.scaler.fit_transform(self.reducer.transform(self.data))
        return self.reduced

    def benchmark(self, estimator, name, data):
        t0 = time()
        sample_size = 300
        labels = self.labels

        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f'
              % (name, (time() - t0), estimator.inertia_,
                 metrics.homogeneity_score(labels, estimator.labels_),
                 metrics.completeness_score(labels, estimator.labels_),
                 metrics.v_measure_score(labels, estimator.labels_),
                 metrics.adjusted_rand_score(labels, estimator.labels_),
                 metrics.adjusted_mutual_info_score(labels,  estimator.labels_),
                 metrics.silhouette_score(data, estimator.labels_,
                                          metric='euclidean',
                                          sample_size=sample_size)))

    def display_reduced_digits(self):
        sys.stdout = open('out/NMFReduceDigitsOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def display_reduced_iris(self):
        sys.stdout = open('out/NMFReduceIrisOutput.txt', 'w')
        print("NMF Reduction of %s:\n" % self.dataset_name)
        print(40 * '-')
        print(self.reduced)
        print("\nLength of 1 input vector before reduction: %d \n" % len(self.data.tolist()[0]))
        print("Length of 1 input vector after reduction: %d \n" % len(self.reduced.tolist()[0]))
        print(40 * '-')
        print(self.reducer.reconstruction_err_)

    def reduce_crossvalidation_set(self, X_train, X_test):
        self.reducer.fit(X_train)
        reduced_X_train = self.scaler.transform(X_train)
        reduced_X_test = self.scaler.transform(X_test)
        return reduced_X_train, reduced_X_test
Example #30
def test_nmf_transform():
    # Test that NMF.transform returns close values
    A = np.abs(random_state.randn(6, 5))
    for solver in ('pg', 'cd'):
        m = NMF(solver=solver, n_components=4, init='nndsvd', random_state=0)
        ft = m.fit_transform(A)
        t = m.transform(A)
        assert_array_almost_equal(ft, t, decimal=2)
Example #31
# Use tf (raw term count) features for LDA.
print("抽取 tf 特征,用于LDA")
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("抽取 tf 特征完成 in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("用tf-idf特征训练NMF模型(范数),, "
      "文章个数=%d and 特征个数=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("训练完成。done in %0.3fs." % (time() - t0))

print("\n在非负的矩阵分解模型(范数)的主题:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("用ft-idf特征训练非负的矩阵分解模型(普通的KL散度), 文章个数=%d and 特征个数=%d..." %
      (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components,
          random_state=1,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
Example #32
start_time = time.time()
# vectorize documents by using tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenization,
                                   max_features=n_features,
                                   max_df=0.9,
                                   min_df=2)
docs_tfidf = tfidf_vectorizer.fit_transform(doc_set)
termid_word_list = tfidf_vectorizer.get_feature_names(
)  # word = termid_word_list[indx]

print("Fitting the NMF model...")
# solver: coordinate descent; regularization strength alpha = 0.1;
# l1_ratio = 0: pure L2 regularization, no L1 regularization
nmf_model = NMF(n_components=n_factors,
                random_state=1,
                solver='cd',
                alpha=.1,
                l1_ratio=.0)
# generate latent factors for documents based on NMF model
docs_lf = nmf_model.fit_transform(docs_tfidf)

for qIndex in range(0, len(queryID_list)):
    #for qIndex in range(0, 2):
    print(str(qIndex) + "/" + str(len(queryID_list)))
    query_str = queries_dict[queryID_list[qIndex]]
    query = [query_str]
    # generate tfidf vector for the query
    query_tfidf = tfidf_vectorizer.transform(query)
    # generate latent factor for the query based on NMF model
    query_lf = nmf_model.transform(query_tfidf)
Example #33
        # tf-idf
        for max_fq in df_gradients:
            tweetImport = codecs.open(importfilename, 'r', 'utf-8')
            # NMF can use tf-idf # lowercase=False
            tfidf_vectorizer = TfidfVectorizer(strip_accents='ascii', ngram_range=(ngram_min, ngram_max), max_df=max_fq, min_df=1, max_features=num_features, stop_words=stop_words, analyzer='word', token_pattern='[a-zA-Z]+')
            tfidf_matrix = tfidf_vectorizer.fit_transform(tweetImport)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names()
            stop_words.extend(tfidf_feature_names)
            tweetImport.close()

            # save the terms ranked by tfidf scores into a list, to be used for wordcloud plotting
            version = 3
            saveTerms_sortedTFIDFscores(outputPath, max_fq, num_features, version, tfidf_feature_names, tfidf_matrix)

            # Run NMF (results not as good as LDA)
            nmf = NMF(n_components=num_topics, random_state=1, alpha=0, init='random').fit(tfidf_matrix)
            display_topics(nmf, tfidf_feature_names, num_top_words)


        # plot all wordclouds in one figure
        # wordcloud_in_one_figure(outputPath, num_features, df_gradients)

        # plot individual wordclouds:
        plt.rcParams['figure.figsize'] = (10.0, 7.0)
        
        for max_fq in df_gradients:
            tfidffilename = outputPath + 'tweet_keyword_tradewar_tfidf_features_' + str(max_fq) + '_' + str(num_features) + '_v3.csv'
            tfidffile = open(tfidffilename, 'r')
            word_text = tfidffile.read()

            wordcloud = WordCloud(colormap='hsv', max_words=1000, width=3000, height=2000, margin=3, collocations=False).generate(word_text)
Example #34
def topics(df, model="lda", stopwords=None):
    """ Either executes LDA or NMF on a dutch document.
    This is a simple implementation and only used for
    "fun" purposes. It is not so much to find the very
    best topics, but topics that are good enough. 
    
    
    Parameters:
    -----------
    df : pandas dataframe
        Pandas dataframe that contains the raw messages
    model : str, default "lda"
        Which model to use for topic modelling. 
        Either "lda" or "nmf" works for now
    stopwords : str, default None
        If you want to remove stopwords, provide a local 
        link to the text file (that includes a list of words)
        including the extension. 
    
    """
    # Prepare stopwords
    if stopwords:
        with open(stopwords) as stopwords_list:
            stopwords_list = stopwords_list.readlines()
            stopwords_list = [word[:-1] for word in stopwords_list]
    else:
        stopwords_list = []

    # Create Topics
    for user in df.User.unique():
        print("#" * len(user) + "########")
        print("### " + user + " ###")
        print("#" * len(user) + "########\n")

        data_samples = df[df.User == user].Message_Prepared
        data_samples = data_samples.tolist()

        if model == "lda":
            # Extracting Features
            tf_vectorizer = CountVectorizer(max_df=0.95,
                                            min_df=2,
                                            stop_words=stopwords_list)
            tf = tf_vectorizer.fit_transform(data_samples)

            # Fitting LDA
            topic_model = LatentDirichletAllocation(n_components=5,
                                                    max_iter=5,
                                                    learning_method='online',
                                                    learning_offset=50.,
                                                    random_state=0)
            topic_model.fit(tf)
            feature_names = tf_vectorizer.get_feature_names()
        else:
            # NMF uses tf-idf
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                               min_df=2,
                                               stop_words=stopwords_list)
            tfidf = tfidf_vectorizer.fit_transform(data_samples)
            feature_names = tfidf_vectorizer.get_feature_names()

            # Run NMF
            topic_model = NMF(n_components=5,
                              random_state=1,
                              alpha=.1,
                              l1_ratio=.5,
                              init='nndsvd')
            topic_model.fit(tfidf)

        print("\nTopics in {} model:".format(model))
        print_top_words(topic_model, feature_names, 7)
Example #35
def update_nmf_graph1(no_topics, nmf_components_value, nmf_alpha_value, nmf_l1ratio_value, min_df_value, max_df_value, ngram_range_value, num_clicks):


	if num_clicks > 0:

		# Getting the filenames
		matrix_filename = 'temp_data/' + temporary_key + '_output_matrix.csv'
		processed_docs_filename = 'temp_data/' + temporary_key + '_processed_docs.csv'
		features_list_filename = 'temp_data/' + temporary_key + '_features_list.csv'
		tfidf_fit_filename = 'temp_data/' + temporary_key + '_vectorizer_model.pickle'

		print('loading nmf input objects')
		# Read in tfidf

		dense_tfidf_matrix = pd.read_csv(matrix_filename)
		print('The shape of the tfidf_matrix is: {}.'.format(dense_tfidf_matrix.shape))

		# Reading in the processed documents
		processed_docs = pd.read_csv(processed_docs_filename, encoding = 'latin1')
		processed_docs = processed_docs['processed_doc'].tolist() 

		print(processed_docs[0])


		features_df = pd.read_csv(features_list_filename)
		features_list = features_df['feature_list'].tolist()
		print('The first five token features are: {}.'.format(features_list[:5]))


		sparse_tfidf_matrix = scipy.sparse.csr_matrix(dense_tfidf_matrix.values)

		# print(sparse_tfidf_matrix)
		print('the sparse tfidf matrix is loaded')

		# Defining the NMF object
		nmf = NMF(n_components=no_topics, random_state=42, alpha=0.1, l1_ratio=.2, \
          max_iter = 500, verbose = False, shuffle = True, init='nndsvd', solver = 'cd')


		print('Computing the NMF for the sparse tfidf matrix')
		nmf_model = nmf.fit(sparse_tfidf_matrix)


		print(nmf_model)
		#--------------------------------------------------------------------------------------------------
		#--------------------------------------------------------------------------------------------------
		def generate_topic_table(model, feature_names, n_top_words):
		    topics = {}
		    for topic_idx, topic in enumerate(model.components_):
		        t = ("topic_%d" % topic_idx)
		        topics[t] = [feature_names[i] for i in top_words(topic, n_top_words)]
		        
		    out_df = pd.DataFrame(topics)
		    out_df = out_df[list(topics.keys())]
		    
		    return out_df
		#--------------------------------------------------------------------------------------------------
		#--------------------------------------------------------------------------------------------------



		print(processed_docs[0])
Example #36
def log_stdvar_NMF_L2(X):
    X = log_stdvar(X)
    k = compute_pcs_needed_to_explain_variance(X,50)
    nmf = NMF(n_components=k)
    Xrd = nmf.fit_transform(X)
    return pairwise_distances(Xrd)

Example #37
# 1 Define the pipeline ------------------------------------------------------------------------------

# Pipeline definition
# --- dimensionality reduction
# --- SVM classifier
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', SVC())
])

# Parameter settings
params_grid = [
    {
        'reduce_dim': [PCA(), NMF(), Isomap(), TruncatedSVD()],
        'reduce_dim__n_components': [2, 3],
        'classify': [SVC(), LinearSVC()],
        'classify__C': [1, 10, 100, 1000]
    }
]

# Check
print(params_grid)


# 2 Run the hyperparameter tuning -----------------------------------------------------------------------

# <Key points>
# - Tune the hyperparameters with a grid search
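
The snippet stops before the search itself; a hedged sketch of how the pipe and params_grid above are typically driven with GridSearchCV, assuming those names and their imports are in scope and using the non-negative digits data so the NMF reducer is valid (the dataset and CV settings are assumptions, not part of the original):

from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV

# Digit pixel intensities are non-negative, so both PCA and NMF can act as reducers.
X, y = load_digits(return_X_y=True)

grid = GridSearchCV(pipe, param_grid=params_grid, cv=3, n_jobs=-1)
grid.fit(X, y)

print(grid.best_params_)
print("best CV accuracy: %.3f" % grid.best_score_)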
Example #38
def extract_components(mov_tot,
                       n_components=6,
                       normalize_std=True,
                       max_iter_DL=-30,
                       method_factorization='nmf',
                       **kwargs):
    """
    From optical flow images can extract spatial and temporal components

    Parameters:
    ----------
    mov_tot: ndarray (can be 3 or 4D)
        contains the optical flow values, either in cartesian or polar, either one (3D) or both (4D coordinates)
        the input is generated by the compute_optical_flow function    

    n_components: int
        number of components to look for

    normalize_std: bool
        whether to normalize each of the optical flow components

    normalize_output_traces: boolean
        whether to normalize the behavioral traces so that they match the units in the movie

    Returns:
    -------
    spatial_filter: ndarray
        set of spatial inferred filters     

    time_trace:ndarray
        set of time components

    norm_fact: ndarray
        normalization factors used

    """

    if mov_tot.ndim == 4:
        if normalize_std:
            norm_fact = np.nanstd(mov_tot, axis=(1, 2, 3))
            mov_tot = old_div(mov_tot, norm_fact[:, np.newaxis, np.newaxis,
                                                 np.newaxis])
        else:
            norm_fact = np.array([1., 1.])
        c, T, d1, d2 = np.shape(mov_tot)

    else:
        norm_fact = 1
        T, d1, d2 = np.shape(mov_tot)
        c = 1

    tt = time.time()
    newm = np.reshape(mov_tot, (c * T, d1 * d2))

    if method_factorization == 'nmf':
        nmf = NMF(n_components=n_components, **kwargs)

        time_trace = nmf.fit_transform(newm)
        spatial_filter = nmf.components_
        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :] for sp in spatial_filter
        ],
                                        axis=0)

    elif method_factorization == 'dict_learn':

        import spams
        newm = np.asfortranarray(newm, dtype=np.float32)
        time_trace = spams.trainDL(newm,
                                   K=n_components,
                                   mode=0,
                                   lambda1=1,
                                   posAlpha=True,
                                   iter=max_iter_DL)

        spatial_filter = spams.lasso(newm,
                                     D=time_trace,
                                     return_reg_path=False,
                                     lambda1=0.01,
                                     mode=spams.spams_wrap.PENALTY,
                                     pos=True)

        spatial_filter = np.concatenate([
            np.reshape(sp, (d1, d2))[np.newaxis, :, :]
            for sp in spatial_filter.toarray()
        ],
                                        axis=0)

    time_trace = [np.reshape(ttr, (c, T)).T for ttr in time_trace.T]

    el_t = time.time() - tt
    print(el_t)
    return spatial_filter, time_trace, norm_fact
Example #39
# Import NMF
from sklearn.decomposition import NMF

# Create an NMF instance: model
model = NMF(n_components=6)

# Fit the model to articles
model.fit(articles)

# Transform the articles: nmf_features
nmf_features = model.transform(articles)

# Print the NMF features
print(nmf_features)

#NMF features of the Wikipedia articles
# Also available is a list titles giving the title of each Wikipedia article.

# Import pandas
import pandas as pd

# Create a pandas DataFrame: df
df = pd.DataFrame(nmf_features, index=titles)

# Print the row for 'Anne Hathaway'
print(df.loc['Anne Hathaway'])

# Print the row for 'Denzel Washington'
print(df.loc['Denzel Washington'])

# When investigating the features, notice that for both actors, NMF feature 3 has by far the highest value. This means that both articles are reconstructed mainly from the 3rd NMF component.
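
A common follow-up (a sketch, assuming nmf_features and titles from the snippet above are in scope): normalise the feature rows and take dot products of the unit vectors, i.e. cosine similarity, to find articles similar to a given one.

import pandas as pd
from sklearn.preprocessing import normalize

# Normalize the NMF features so each row is a unit vector
norm_features = normalize(nmf_features)

# Cosine similarity of every article to 'Anne Hathaway' is then a simple dot product
df_norm = pd.DataFrame(norm_features, index=titles)
article = df_norm.loc['Anne Hathaway']
similarities = df_norm.dot(article)

# Show the most similar articles
print(similarities.nlargest())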
Example #40
def test_nmf_fit_close(solver):
    rng = np.random.mtrand.RandomState(42)
    # Test that the fit is not too far away
    pnmf = NMF(5, solver=solver, init='nndsvdar', random_state=0, max_iter=600)
    X = np.abs(rng.randn(6, 5))
    assert pnmf.fit(X).reconstruction_err_ < 0.1
Example #41
def test_n_components_greater_n_features():
    # Smoke test for the case of more components than features.
    rng = np.random.mtrand.RandomState(42)
    A = np.abs(rng.randn(30, 10))
    NMF(n_components=15, random_state=0, tol=1e-2).fit(A)
Example #42
# R is an m*n matrix: m = items, n = users
RATE_MATRIX = np.array(
    [[5, 5, 3, 0, 5, 5, 4, 3, 2, 1, 4, 1, 3, 4, 5],
     [5, 0, 4, 0, 4, 4, 3, 2, 1, 2, 4, 4, 3, 4, 0],
     [0, 3, 0, 5, 4, 5, 0, 4, 4, 5, 3, 0, 0, 0, 0],
     [5, 4, 3, 3, 5, 5, 0, 1, 1, 3, 4, 5, 0, 2, 4],
     [5, 4, 3, 3, 5, 5, 3, 3, 3, 4, 5, 0, 5, 2, 4],
     [5, 4, 2, 2, 0, 5, 3, 3, 3, 4, 4, 4, 5, 2, 5],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0],
     [5, 4, 3, 3, 2, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2],
     [5, 4, 3, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]]
)

nmf_model = NMF(n_components=2)  # assume 2 latent topics
item_dis = nmf_model.fit_transform(RATE_MATRIX)
user_dis = nmf_model.components_

print('User topic distribution: ' + str(user_dis.shape))
print(user_dis)
print('Movie topic distribution: ' + str(item_dis.shape))
print(item_dis)

plt1 = plt
plt1.plot(item_dis[:, 0], item_dis[:, 1], 'ro')
plt1.xlim((-1, 3))
plt1.ylim((-1, 3))
plt1.title(u'Item Distribution')  # set the plot title

count = 1
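
Since the zeros in RATE_MATRIX stand for unrated items, a natural follow-up (a sketch, not part of the original snippet) is to reconstruct the full matrix from the two factors and read predicted scores off the reconstruction:

# item_dis is items x topics (10 x 2), user_dis is topics x users (2 x 15)
reconstructed = item_dis.dot(user_dis)
print(reconstructed.shape)  # (10, 15), same shape as RATE_MATRIX

# e.g. item 1 / user 1 is unrated (0) in RATE_MATRIX; the reconstruction gives a predicted score
print(RATE_MATRIX[1, 1], reconstructed[1, 1])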
Example #43
    pipe = Pipeline([('reduce_dim', PCA()), ('classify', LinearSVC())])

    N_EXPERIMENTS = 5
    N_FEATURES_OPTIONS = [4]
    C_OPTIONS = [1, 10, 100, 1000]

    reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

    non_nested_scores = np.zeros(N_EXPERIMENTS)
    nested_scores = np.zeros(N_EXPERIMENTS)

    ############################################################

    param_grid = [
        {
            'reduce_dim': [PCA(iterated_power=7), NMF()],
            'reduce_dim__n_components': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
        {
            'reduce_dim': [SelectKBest(chi2)],
            'reduce_dim__k': N_FEATURES_OPTIONS,
            'classify__C': C_OPTIONS
        },
    ]

    print('Grid Search experiments... ')
    start = time()
    for ith_exp in range(N_EXPERIMENTS):

        # CV technique
Example #44
# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
                      xticklabels=param_grid['svm__C'],
                      ylabel='svm__gamma',
                      yticklabels=param_grid['svm__gamma'], cmap="viridis")


"""-----------------------------------------------------------------------------"""

"""==========================================================================================="""

"""-----------------------------------------------------------------------------"""

"""NMF pre-processing with SVC algorithm """
##Pipelines in Grid Searches
pipe = Pipeline([("scaler", NMF()), ("svm", SVC())])
param_grid = {'scaler__n_components': [5],
              'svm__C': [0.00001, 0.1],
              'svm__gamma': [0.00001, 0.1]}
grid = GridSearchCV(pipe, param_grid = param_grid, cv = 5)
grid.fit(X_train,y_train )
pred = grid.predict(X_test)
print("NMF pre-processing with SVC algorithm")
print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
print("Test set accuracy: {:.2f}".format(grid.score(X_test,y_test)))
print("f1 score: {:.2f}".format(f1_score(y_test,pred)))
print("Best parameters: {}".format(grid.best_params_))
print(classification_report(y_test, pred, target_names=["mol", "no_mol"]))
scores = grid.cv_results_['mean_test_score'].reshape(2, 2)
# plot the mean cross-validation scores
mglearn.tools.heatmap(scores, xlabel='svm__C',
Example #45
print "Extracting tf-idf features for NMF..."
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(posts)

print "Extracting tf features for LDA..."
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(posts)

# cell 3 - Using NMF to get top topics
print "Fitting the NMF model with tf-idf features," "n_samples=%d and n_features=%d..." % (
    n_samples, n_features)
nmf = NMF(n_components=n_topics, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)

print "\nTopics in NMF model:"
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# cell 4 - Using LDA to get top topics
print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (
    n_samples, n_features)
lda = LatentDirichletAllocation(n_topics=n_topics,
                                max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
Example #46
model.add(e)
model.add(Flatten())
model.add(Dense(10173, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(13, activation='softmax'))

model.compile(optimizer='Adadelta', loss='categorical_crossentropy', metrics=['acc'])

history = model.fit(tfidf, y_label, epochs=20, verbose=1,validation_split=0.3)



# Run NMF
from sklearn.decomposition import NMF, LatentDirichletAllocation
no_topics = 13
nmf = NMF(n_components=no_topics, init='nndsvd').fit(tfidf)
W = nmf.fit_transform(tfidf)
H = nmf.components_

# Run LDA
lda = LatentDirichletAllocation(n_components=no_topics, learning_method='online', learning_offset=50.).fit(tf)
W_lda = lda.fit_transform(tf)
H_lda = lda.components_


def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print ("Topic %d:" % (topic_idx))
        print (" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
Example #47
    def handle(self, *args, **options):
        parent_run_id = options['run_id']
        K = options['K']
        nWords = 50  #options['nWords']
        fileDest = ""  #options['fileDest']

        parent_stat = RunStats.objects.get(pk=parent_run_id)

        n_features = parent_stat.max_features

        if fileDest == '':

            run_id = init(n_features)
            stat = RunStats.objects.get(run_id=run_id)
            stat.query = Query.objects.get(pk=parent_stat.query.id)
            stat.method = "DT"
            stat.parent_run_id = parent_run_id
            stat.save()

        for tp in parent_stat.periods.all():
            stat.periods.add(tp)

        tops = Topic.objects.filter(run_id=parent_run_id,
                                    topicterm__isnull=False).distinct()
        terms = Term.objects.all()

        B = np.zeros((tops.count(), terms.count()))

        wt = 0
        for topic in tops:
            tts = TopicTerm.objects.filter(
                topic=topic).order_by('-score')[:nWords]
            if len(tts) == 0:
                if fileDest != '':
                    print(wt)
                    continue
                print(topic)
            for tt in tts:
                B[wt, tt.term.id] = tt.score * np.log1p(topic.score)
            wt += 1

        col_sum = np.sum(B, axis=0)
        vocab_ids = np.flatnonzero(col_sum)

        row_sum = np.sum(B, axis=1)
        top_ids = np.flatnonzero(row_sum)

        print(np.where(~B.any(axis=1)))

        # we only want the columns where there are at least some
        # topic-term values
        B = B[:, vocab_ids]

        print(B.shape)

        print(np.where(~B.any(axis=1)))

        if fileDest != '':
            np.save(fileDest, B)
            sys.exit()

        nmf = NMF(n_components=K, random_state=1, alpha=.1, l1_ratio=.5).fit(B)

        ## Add dynamic topics
        dtopics = []
        for k in range(K):
            dtopic = DynamicTopic(run_id=RunStats.objects.get(pk=run_id))
            dtopic.save()
            dtopics.append(dtopic)

        dtopic_ids = list(
            DynamicTopic.objects.filter(run_id=run_id).values_list('id',
                                                                   flat=True))

        print(dtopic_ids)

        ##################
        ## Add the dtopic*term matrix to the db
        print("Adding topicterms to db")
        t0 = time()
        ldalambda = find(csr_matrix(nmf.components_))
        topics = range(len(ldalambda[0]))
        tts = []
        pool = Pool(processes=8)
        tts.append(
            pool.map(
                partial(f_dlambda,
                        m=ldalambda,
                        v_ids=vocab_ids,
                        t_ids=dtopic_ids,
                        run_id=run_id), topics))
        pool.terminate()
        tts = flatten(tts)
        gc.collect()
        sys.stdout.flush()
        django.db.connections.close_all()
        DynamicTopicTerm.objects.bulk_create(tts)
        print("done in %0.3fs." % (time() - t0))

        ## Add the wtopic*dtopic matrix to the database
        gamma = nmf.transform(B)

        for topic in range(len(gamma)):
            for dtopic in range(len(gamma[topic])):
                if gamma[topic][dtopic] > 0:
                    tdt = TopicDTopic(topic=tops[topic],
                                      dynamictopic_id=dtopic_ids[dtopic],
                                      score=gamma[topic][dtopic])
                    tdt.save()

        ## Calculate the primary dtopic for each topic
        for t in tops:
            try:
                t.primary_dtopic = TopicDTopic.objects.filter(
                    topic=t).order_by('-score').first().dynamictopic
                t.save()
            except:
                pass

        stat.error = parent_stat.error + nmf.reconstruction_err_
        stat.errortype = "Frobenius"
        stat.last_update = timezone.now()
        stat.save()
        print("updating and summarising run, {}".format(run_id))
        management.call_command('update_run', run_id)

        management.call_command('update_run', run_id)
Example #48
                             stop_words='english',
                             lowercase=True,
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(wines["processed_description"])

NUM_TOPICS = 10

# Latent Dirichlet Allocation Model
lda = LatentDirichletAllocation(n_components=NUM_TOPICS,
                                max_iter=10,
                                learning_method='online',
                                verbose=True)
data_lda = lda.fit_transform(data_vectorized)

# Non-Negative Matrix Factorization Model
nmf = NMF(n_components=NUM_TOPICS)
data_nmf = nmf.fit_transform(data_vectorized)

# Latent Semantic Indexing Model using Truncated SVD
lsi = TruncatedSVD(n_components=NUM_TOPICS)
data_lsi = lsi.fit_transform(data_vectorized)


# Functions for printing keywords for each topic
def selected_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
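
A hypothetical usage note for the helper above, assuming the vectorizer and the three fitted models from this snippet are in scope:

print("LDA topics:")
selected_topics(lda, vectorizer)

print("NMF topics:")
selected_topics(nmf, vectorizer)

print("LSI topics:")
selected_topics(lsi, vectorizer)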

Example #49
def ldatopicmodeling(sentencetuples, searchobject):
    """

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
		When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
		When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.

	see sample results at end of file

	:param sentencetuples:
	:param activepoll:
	:return:
	"""

    maxfeatures = 2000
    components = 15
    topwords = 15

    maxfreq = .60
    minfreq = 5
    iterations = 12

    mustbelongerthan = 2

    sentencetuples = [
        s for s in sentencetuples
        if len(s[1].strip().split(' ')) > mustbelongerthan
    ]
    sentences = [s[1] for s in sentencetuples]

    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentences for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq,
                                    min_df=minfreq,
                                    max_features=maxfeatures)

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components,
                                    max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      max_features=maxfeatures)

    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (len(sentences), maxfeatures))

    nmf = NMF(n_components=components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return
Example #50
def gen_decomposition_stats_vector_ftr51(stats_name,
                                         size='7d',
                                         non_zero=False,
                                         decomp_method='lda',
                                         n_components=5):
    """
    :param stats_name: str, name of the statistic computed over the drug counts
    :param size: str, time granularity of the statistic: 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, whether the statistic counts non-zero entries
    :param decomp_method: str, decomposition method
    :param n_components: int, dimensionality after decomposition
    :return:
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    mask = (stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) & non_zero
    assert not mask
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 读取数据

    ftr51_stats_sparse_matrix = sparse.load_npz(
        get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray()

    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components,
                           n_iter=50,
                           random_state=42)
        ftr51_stats_matrix_decomp = svd.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components,
                  init='random',
                  random_state=0,
                  max_iter=200)
        ftr51_stats_matrix_decomp = nmf.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0,
                                        n_jobs=1)
        ftr51_stats_matrix_decomp = lda.fit_transform(
            ftr51_stats_sparse_matrix)
        joblib.dump(lda, "lda_{}_{}.m".format(stats_name, size))

    columns = [
        '{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size,
                                             non_zero, n_components, j)
        for j in range(ftr51_stats_matrix_decomp.shape[1])
    ]
    stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns)
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)

    return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format(
        stats_name, size, non_zero, decomp_method, n_components)
Example #51
class TweetAnalyzer:

    def __init__(self, tweets=None):
        if not tweets:
            try:
                with jsonlines.open(TWEETS_FILE) as reader:
                    self.tweets = [tweet for tweet in reader]
                    print('Loaded {} tweets from {}'.format(
                        len(self.tweets), TWEETS_FILE))
            except FileNotFoundError:
                print("Can't find the tweets file")
            except Exception as e:
                print(e)
        else:
            self.tweets = tweets

        # Extract the keys from the first tweet and spread them into a list
        columns = [*self.tweets[0]]
        self.tfidf_result = None
        self.feature_names = None
        self.df = pd.DataFrame(self.tweets, columns=columns)
        self.clean_tweets()
        if DEBUG:
            print(self.df.head())

    def clean_tweets(self):
        start = timer()
        self.df.text = self.df.text.apply(TweetPreprocessor.strip_links)
        self.df.text = self.df.text.apply(TweetPreprocessor.strip_mentions)
        self.df.text = self.df.text.apply(TweetPreprocessor.strip_hashtags)
        self.df.text = self.df.text.apply(TweetPreprocessor.strip_rt)
        self.df.text = self.df.text.apply(
            TweetPreprocessor.remove_special_characters)
        end = timer()
        print('Cleaned tweets in {:.3f}s'.format(end - start))

    def vectorize(self):
        self.vectorizer = TfidfVectorizer(stop_words='english')
        self.tfidf_result = self.vectorizer.fit_transform(self.df['text'])
        self.feature_names = self.vectorizer.get_feature_names()

    def top_n(self, top=100):
        if self.feature_names is None or self.tfidf_result is None:
            print('Must run vectorize() first before calling top_n')
            return

        scores = zip(self.feature_names,
                     np.asarray(self.tfidf_result.sum(axis=0)).ravel())

        sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

        labels, scores = [], []

        # Get the scores and labels of the top `top` terms
        for item in sorted_scores[:top]:
            print("{0:50} Score: {1}".format(item[0], item[1]))
            # sns.distplot(item[1], label=item[0])
            labels.append(item[0])
            scores.append(item[1])

        index = np.arange(len(scores))
        plt.bar(index, scores)
        plt.xlabel('Word', fontsize=12)
        plt.ylabel('TFIDF Score', fontsize=12)
        plt.xticks(index, labels, fontsize=8, rotation=90)
        plt.title('Top {} features'.format(top))
        plt.savefig('Top_{}'.format(top))

    def topic_model(self, num_topics=10):
        if DEBUG:
            print('Performing topic modeling with {} topics'.format(num_topics))

        # Build a Latent Dirichlet Allocation Model
        self.lda_model = LatentDirichletAllocation(n_components=num_topics, max_iter=10, learning_method='online')
        lda_Z = self.lda_model.fit_transform(self.tfidf_result)
        print('LDA shape: ')
        print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

        # Build a Non-Negative Matrix Factorization Model
        self.nmf_model = NMF(n_components=num_topics)
        nmf_Z = self.nmf_model.fit_transform(self.tfidf_result)
        print('NMF shape: ')
        print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

        # Build a Latent Semantic Indexing Model
        self.lsi_model = TruncatedSVD(n_components=num_topics)
        lsi_Z = self.lsi_model.fit_transform(self.tfidf_result)
        print('LSI shape: ')
        print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

        if DEBUG:
            # Let's see what the first document in the corpus looks like in different topic spaces
            print("LDA Model:")
            self.print_topics(self.lda_model)
            print("=" * 20)

            print("NMF Model:")
            self.print_topics(self.nmf_model)
            print("=" * 20)

            print("LSI Model:")
            self.print_topics(self.lsi_model)
            print("=" * 20)

    # Helper function to print topics
    def print_topics(self, model, top_n=10):
        for idx, topic in enumerate(model.components_):
            print("Topic %d:" % (idx))
            print([(self.vectorizer.get_feature_names()[i], topic[i])
                            for i in topic.argsort()[:-top_n - 1:-1]])

    def plot_topic_model_SVD(self):
        from bokeh.io import push_notebook, show, output_notebook
        from bokeh.plotting import figure
        from bokeh.models import ColumnDataSource, LabelSet
        output_notebook()

        self.svd = TruncatedSVD(n_components=2)
        words_2d = self.svd.fit_transform(self.tfidf_result.T)

        df = pd.DataFrame(columns=['x', 'y', 'word'])
        df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], self.feature_names

        source = ColumnDataSource(ColumnDataSource.from_df(df))
        labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                        text_font_size="8pt", text_color="#555555",
                        source=source, text_align='center')

        plot = figure(plot_width=600, plot_height=600)
        plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
        plot.add_layout(labels)
        show(plot, notebook_handle=True)
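
A minimal, hypothetical driver for the class above; it assumes TWEETS_FILE, DEBUG and the TweetPreprocessor helpers are available in the same module.

# Hypothetical usage sketch; every name comes from the class above.
analyzer = TweetAnalyzer()           # loads and cleans tweets from TWEETS_FILE
analyzer.vectorize()                 # builds the TF-IDF matrix
analyzer.top_n(top=50)               # plots the 50 highest-scoring terms
analyzer.topic_model(num_topics=10)  # fits LDA, NMF and LSI topic models
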
Пример #52
0
E_symbol = np.asarray(E_symbol)
P_symbol = np.asarray(P_symbol)
E = pd.DataFrame(E)
PeakO = pd.DataFrame(PeakO)
E = quantileNormalize(E)
PeakO = quantileNormalize(PeakO)

print("Initializing non-negative matrix factorization for E...")
E[E > 10000] = 10000
X = np.log(1 + E)

err1 = np.zeros(rep)
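# Run rep short NMF fits with different random seeds and record each run's
# Frobenius reconstruction error; the best seed is then refitted below with
# more iterations before each column is assigned to its strongest component.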
for i in range(0, rep):
    model = NMF(n_components=K,
                init='random',
                random_state=i,
                solver='cd',
                max_iter=50)
    W20 = model.fit_transform(X)
    H20 = model.components_
    err1[i] = LA.norm(X - np.dot(W20, H20), ord='fro')

model = NMF(n_components=K,
            init='random',
            random_state=np.argmin(err1),
            solver='cd',
            max_iter=1000)
W20 = model.fit_transform(X)
H20 = model.components_
S20 = np.argmax(H20, 0)
Пример #53
0
y = x_p[:, 1]

plt.figure()
plt.title('after the PCA method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <p style="color:green">Donc la couleur jaune représente les personnes mort </p>
# <i style="color:blue">On peut aussi utiliser la méthode NMF</i>

# In[31]:

from sklearn.decomposition import NMF

nmf = NMF(n_components=2)
x_n = nmf.fit(data).transform(data)
print(x_n)
x = x_n[:, 0]
y = x_n[:, 1]

plt.figure()
plt.title('after the NMF method')
plt.scatter(x, y, c=label)
plt.xlabel('dimension 1')
plt.ylabel('dimension 2')

# <h3 style="color:#8080C0">
# Dans la suite, nous utilisons une méthode d'apprentissage automatique afin de prédire la classe : les patients sont soit «décédés» (‘died’) soit «sortis» (‘discharged’) de l'hôpital. Vous pouvez utiliser la classification par K-Nearest Neighbours (K-NN), l’arbre de decision ou le classificateur Bayes.</h3>

# In[42]:
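
# Illustrative sketch only (not part of the original notebook): a K-NN
# classifier for the 'died' / 'discharged' task described above. It assumes
# data holds the feature matrix and label the class vector used earlier;
# the train/test split and n_neighbors value are arbitrary choices.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(
    data, label, test_size=0.3, random_state=0)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
print('K-NN test accuracy: {:.3f}'.format(knn.score(X_test, y_test)))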
Пример #54
0
 def lanchNMF(self):
     model = NMF(n_components=3, init='random', random_state=0)
     self.nmf_ = model.fit_transform(self.img)
Пример #55
0
import numpy as np
from sklearn.decomposition import NMF, TruncatedSVD, ProjectedGradientNMF
model = NMF(n_components=2, alpha=0.01)

#Store AD
ad_ID_dict   = {}
#ad_list = []
#ad_list = list(ad_list)


#Assign ID number
ad_ID = 0
user_ID = 0

max_feature = 0


# ad_ID for ad_num
adID_for_num = {}


with open ('ad_ID.dat') as file:
    for line in file:
        data = line.strip('\n').split('   ')
        #print(data)
        adID_for_num[int(data[1])] = int(data[0])

file.close()


Пример #56
0
# Challenge 1
#%%
import numpy as np
np.set_printoptions(threshold=np.inf)
from sklearn.decomposition import NMF

M = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4],
     [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2],
     [1, 5, 3, 5, None, 5, 5]]

M1 = [[4, 4, 2, 2, 3, 1, 1], [1, 5, 5, 2, 1, 4, 5], [1, 5, 1, 1, 4, 1, 4],
      [5, 4, 3, 1, 1, 1, 2], [1, 4, 4, 1, 1, 5, 5], [5, 5, 3, 5, 5, 1, 2]]
M2 = [[4, 4, 2, 2, 1, 1], [1, 5, 5, 2, 4, 5], [1, 5, 1, 1, 1, 4],
      [5, 4, 3, 1, 1, 2], [1, 4, 4, 1, 5, 5], [5, 5, 3, 5, 1, 2],
      [1, 5, 3, 5, 5, 5]]
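# M has one unknown rating (row 7, column 5). M1 drops that row and M2 drops
# that column, so W2 (row factors learned from M2) combined with H1 (column
# factors learned from M1) gives a full 7x7 reconstruction, including an
# estimate for the missing cell.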
model1 = NMF(n_components=3)
model1.fit(M1)
W2 = model1.fit_transform(M2)
H2 = model1.components_
W1 = model1.fit_transform(M1)
H1 = model1.components_
print(np.matmul(W2, H1))
# Challenge 2
#%%

# Lloyd’s algorithm
import random
import matplotlib.pyplot as plt


class lloyds(object):
Пример #57
0
    start_time = time.time()
    U_50, sigma_50, Vt_50 = svds(demeaned_input, k=50)
    sigma_50 = np.diag(sigma_50)
    svd_50_prediction = np.dot(np.dot(U_50, sigma_50), Vt_50) + user_mean
    end_time = time.time()

    svd_50_HR10 = test.hit_rate(svd_50_prediction[len(train_data):], last_item,
                                10)
    svd_50_HR25 = test.hit_rate(svd_50_prediction[len(train_data):], last_item,
                                25)
    svd_50_arhr = test.arhr(svd_50_prediction[len(train_data):], last_item)
    svd_50_time = end_time - start_time

    # NMF
    start_time = time.time()
    nmf = NMF(2)
    W = nmf.fit_transform(entire_data)
    H = nmf.components_
    nmf_prediction = np.dot(W, H)
    end_time = time.time()

    nmf_HR10 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 10)
    nmf_HR25 = test.hit_rate(nmf_prediction[len(train_data):], last_item, 25)
    nmf_arhr = test.arhr(nmf_prediction[len(train_data):], last_item)
    nmf_time = end_time - start_time

    # print tabulated result
    table = tabulate(
        [[
            'HR10', dhrbm_HR10, itempop_HR10, itempop_cluster_HR10,
            svd_10_HR10, svd_50_HR10, nmf_HR10
Пример #58
0
image_shape = people.images[0].shape

mask = np.zeros(people.target.shape, dtype=bool)
for target in np.unique(people.target):
    mask[np.where(people.target == target)[0][:50]] = 1
X_people = people.data[mask]
y_people = people.target[mask]
X_people = X_people / 255.
X_train, X_test, y_train, y_test = train_test_split( \
        X_people, y_people, stratify=y_people, random_state=0)

mglearn.plots.plot_nmf_illustration()
mglearn.plots.plot_nmf_faces(X_train, X_test, image_shape)

from sklearn.decomposition import NMF
nmf = NMF(n_components=15, random_state=0)
nmf.fit(X_train)
X_train_nmf = nmf.transform(X_train)
X_test_nmf = nmf.transform(X_test)

fix, axes = plt.subplots(3, 5, figsize=(15, 12), \
            subplot_kw={'xticks': (), 'yticks': ()})
for i, (component, ax) in enumerate(zip(nmf.components_, axes.ravel())):
    ax.imshow(component.reshape(image_shape))
    ax.set_title("{}. component".format(i))

# display the training faces that have a large weight on component compn
compn = 11
inds = np.argsort(X_train_nmf[:, compn])[::-1]
fix, axes = plt.subplots(2, 5, figsize=(15, 8), \
            subplot_kw={'xticks': (), 'yticks': ()})
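# The original snippet is cut off here; a plausible continuation (an
# assumption, following the usual pattern of this example) displays the
# training faces that load most strongly on the selected component:
for ind, ax in zip(inds, axes.ravel()):
    ax.imshow(X_train[ind].reshape(image_shape))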
Пример #59
0
    def test_custom_nmf(self):

        mat = np.array([[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0],
                        [1, 0, 0, 0], [1, 0, 0, 0]], dtype=np.float64)
        mat[:mat.shape[1], :] += np.identity(mat.shape[1])

        mod = NMF(n_components=2)
        W = mod.fit_transform(mat)
        H = mod.components_

        def predict(W, H, row_index, col_index):
            return np.dot(W[row_index, :], H[:, col_index])

        pred = mod.inverse_transform(W)

        exp = []
        got = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                exp.append((i, j, pred[i, j]))
                got.append((i, j, predict(W, H, i, j)))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, got))
        assert max_diff <= 1e-5

        def nmf_to_onnx(W, H):
            """
            The function converts an NMF described by matrices
            *W*, *H* (*WH* approximates the training data *M*)
            into a function which takes two indices *(i, j)*
            and returns the prediction for that cell. It assumes
            these indices apply to the training data.
            """
            col = OnnxArrayFeatureExtractor(H, 'col')
            row = OnnxArrayFeatureExtractor(W.T, 'row')
            dot = OnnxMul(col, row, op_version=TARGET_OPSET)
            res = OnnxReduceSum(dot, output_names="rec",
                                op_version=TARGET_OPSET)
            indices_type = np.array([0], dtype=np.int64)
            onx = res.to_onnx(inputs={'col': indices_type,
                                      'row': indices_type},
                              outputs=[('rec', FloatTensorType((None, 1)))])
            return onx

        model_onnx = nmf_to_onnx(W, H)
        sess = InferenceSession(model_onnx.SerializeToString())

        def predict_onnx(sess, row_indices, col_indices):
            res = sess.run(None,
                           {'col': col_indices,
                            'row': row_indices})
            return res

        onnx_preds = []
        for i in range(mat.shape[0]):
            for j in range(mat.shape[1]):
                row_indices = np.array([i], dtype=np.int64)
                col_indices = np.array([j], dtype=np.int64)
                pred = predict_onnx(sess, row_indices, col_indices)[0]
                onnx_preds.append((i, j, pred[0, 0]))

        max_diff = max(abs(e[2] - o[2]) for e, o in zip(exp, onnx_preds))
        assert max_diff <= 1e-5
Пример #60
0
def plot_optimal_k(docs, document_term_mat, vectorizer,
                   kmin=3, kmax=15, num_top_terms=15,
                   alpha=.1, l1_ratio=.5,
                   dim_size=500, min_df=20, max_vocab_size=5000,
                   model_file_path='./data/',
                   model_file_name='w2v-model.bin'):
    '''
    Run NMF for each k between min and max and plot to assess optimal k.

    Input:
        docs - corpus of documents as a list
        document_term_mat - TFIDF matrix from the vectorizer
        vectorizer - scikit-learn TFIDF vectorizer (trained in TopicModeller)

    Returns:
        Int - optimal k number
    '''
    topic_models = []

    # Run NMF for each value of k
    for k in range(kmin, kmax+1):
        t1 = time.time()

        # Run NMF
        model = NMF(n_components=k, init='nndsvd',
                    alpha=alpha, l1_ratio=l1_ratio)

        W = model.fit_transform(document_term_mat)
        H = model.components_

        # Store for iterating over all the models (of each k size)
        topic_models.append((k, W, H))

        print("Processed NMF for k=%d of %d - Time: %0.3fs." % (k, kmax, (time.time() - t1)), end='\r', flush=True)
    print()

    # If the model is already built get it from disk, otherwise
    # build a Skipgram Word2Vec model from all documents
    # in the input file using Gensim:
    model_path = model_file_path + model_file_name
    if not os.path.exists(model_file_path):
        os.makedirs(model_file_path)

    w2v_model = None
    try:
        w2v_model = gensim.models.Word2Vec.load(model_path)
    except Exception as e:
        print('No existing word2vec model found to load. Exception: %s.\n'
              'Building it...' % (e))

    # w2v_model = None - uncomment to force rebuild every time
    if w2v_model:
        print('Existing word2vec Model loaded from \'%s\'' % model_path)
    else:
        docgen = nlp_utils.TokenGenerator(docs)
        # Process w2v with model of n dimensions and min doc-term freq as min_df
        t1 = time.time()
        w2v_model = gensim.models.Word2Vec(docgen, sg=1, size=dim_size,
                                           max_vocab_size=max_vocab_size,
                                           min_count=min_df)
        print("- Time: %0.3fs." % (time.time() - t1))
        # Save for later use, so that we do not need to rebuild it:
        print('Saving it...')
        w2v_model.save(model_path)

    print(('word2vec model has %d terms' % len(w2v_model.wv.vocab)))

    # Implement TC-W2V coherence score measure
    def calculate_coherence(w2v_model, term_rankings):
        overall_coherence = 0.0
        for topic_index in range(len(term_rankings)):
            # check each pair of terms
            pair_scores = []
            # print 'Topic %s: %s top words: %s' % (topic_index,
            #                                       len(term_rankings[topic_index]),
            #                                       term_rankings[topic_index])
            for pair in combinations(term_rankings[topic_index], 2):
                pair_scores.append(w2v_model.similarity(pair[0], pair[1]))
            # get the mean for all pairs in this topic
            topic_score = sum(pair_scores) / len(pair_scores)
            overall_coherence += topic_score
        # get the mean score across all topics
        return overall_coherence / len(term_rankings)

    # Function to get the topic descriptor
    # (i.e. list of top terms) for each topic:
    def get_descriptor(all_terms, H, topic_index, num_top_terms):
        # reverse sort the values to sort the indices
        top_indices = np.argsort(H[topic_index, :])[::-1]
        # now get the terms corresponding to the top-ranked indices
        top_terms = []
        for term_index in top_indices[0:num_top_terms]:
            top_terms.append(all_terms[term_index])
        return top_terms

    # Get the vocabulary (all terms) from the TF-IDF vectorizer:
    vocab = vectorizer.get_feature_names()
    # vocab = w2v_model.wv.vocab

    # Process each of the models for different values of k:
    k_values = []
    coherences = []
    print('Calculating coherence scores...')
    for (k, W, H) in topic_models:
        # Get all topic descriptors - the term_rankings, based on top n terms
        term_rankings = []
        for topic_index in range(k):
            # term_rankings.append(get_descriptor(vocab, H, topic_index, num_top_terms))
            top_words = [vocab[i] for i in H[topic_index, :].argsort()[:-num_top_terms - 1:-1]]
            top_words = [x for x in top_words if x in w2v_model.wv.vocab]
            term_rankings.append(top_words)
        # Calculate the coherence based on our Word2vec model
        k_values.append(k)
        coherences.append(calculate_coherence(w2v_model, term_rankings))
        # print(('K=%02d: Coherence=%.4f' % (k, coherences[-1])))

    # Plot a line of coherence scores to identify an appropriate k value.
    plt.style.use("ggplot")
    matplotlib.rcParams.update({"font.size": 14})
    fig = plt.figure(figsize=(13, 7))
    # Create the line plot
    ax = plt.plot(k_values, coherences)
    plt.xticks(k_values)
    plt.xlabel("Number of Topics")
    plt.ylabel("Mean Coherence")
    # Add the points
    plt.scatter(k_values, coherences, s=120)
    # Find and annotate the maximum point on the plot
    ymax = max(coherences)
    xpos = coherences.index(ymax)
    best_k = k_values[xpos]
    plt.annotate('k=%d' % best_k, xy=(best_k, ymax), xytext=(best_k, ymax),
                 textcoords="offset points", fontsize=16)
    print('Optimal number of k topics: %s' % best_k)
    # Show the plot
    plt.show()

    k = best_k
    # Get the model that we generated earlier.
    W = topic_models[k-kmin][1]
    H = topic_models[k-kmin][2]

    # Display the topics and descriptor words for the best k model
    for topic_index in range(k):
        descriptor = get_descriptor(vectorizer.get_feature_names(),
                                    H, topic_index, num_top_terms)
        str_descriptor = ", ".join(descriptor)
        print(("Topic %02d: %s" % (topic_index, str_descriptor)))

    return int(k)
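
A hypothetical call, assuming docs is a list of raw documents and that gensim and nlp_utils are importable as in the function above; the vectorizer settings are illustrative.

# Illustrative usage sketch; parameter values are assumptions.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=20, max_features=5000)
document_term_mat = vectorizer.fit_transform(docs)
best_k = plot_optimal_k(docs, document_term_mat, vectorizer, kmin=3, kmax=12)
print('Selected number of topics:', best_k)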