import os
import pickle
import sys
from operator import itemgetter

import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel
from gensim.models.nmf import Nmf
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

# Helper functions used below (process_text, topic_table, whitespace_tokenizer,
# unique_words, get_unstemmed_word, predict_category) are defined elsewhere in
# the project.


def nmf(filename):
    # tf-idf: min_df = 10, max_df = 0.60, max_features = 5000
    # sklearn-model: nmf

    # get data
    newspaper_input = pd.read_csv(filename, na_filter=False)
    newspaper_input['processed_text'] = newspaper_input['article'].apply(process_text)
    texts = newspaper_input['processed_text']

    # create dictionary to pass as input to gensim model
    dictionary = Dictionary(newspaper_input['processed_text'])
    # filter out words that are above or below the thresholds set
    dictionary.filter_extremes(no_below=10, no_above=0.60, keep_n=5000)
    # convert to bag of words (corpus) to pass to gensim nmf model
    # [[(word_id, # times word appears in document), ...], ...]
    corpus = [dictionary.doc2bow(text) for text in texts]

    # find the optimal number of topics using gensim NMF
    # https://radimrehurek.com/gensim/models/nmf.html
    # testing topic numbers 10, 15, 20, ..., 55 to find the best fit for the data
    topic_nums = list(np.arange(10, 56, 5))
    coherence_scores = []
    for num in topic_nums:
        # initialize NMF model
        nmf = Nmf(
            corpus=corpus,
            num_topics=num,
            id2word=dictionary,
            chunksize=500,            # number of documents in each training chunk
            passes=10,                # number of full passes over the training corpus
            kappa=0.1,                # gradient descent step size
            minimum_probability=0.001,
            w_max_iter=300,           # max iterations to train W per batch
            w_stop_condition=0.0001,  # stop training W for the batch once the error difference drops below this
            h_max_iter=100,
            h_stop_condition=0.001,
            normalize=True,
            random_state=42)
        # initialize coherence model to score the topics
        # https://radimrehurek.com/gensim/models/coherencemodel.html
        cm = CoherenceModel(model=nmf, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_scores.append(round(cm.get_coherence(), 5))

    # pair each topic number with its coherence score
    scores = list(zip(topic_nums, coherence_scores))
    # sort scores by score (not topic_num)
    scores = sorted(scores, key=itemgetter(1), reverse=True)
    # get the best number of topics
    best_num_topics, best_coherence_score = scores[0]
    print('scores: ', scores)
    print('num_topics: ', str(best_num_topics))
    print('coherence: ', str(best_coherence_score))

    tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 2),        # unigrams and bigrams
        max_df=0.60,
        min_df=10,
        max_features=5000,
        preprocessor=' '.join)     # texts are token lists; join them back into strings
    # fit + transform: returns document-term matrix (frequency of word i in document j)
    tfidf = tfidf_vectorizer.fit_transform(texts)
    # all the words we'll be looking at
    tfidf_fn = tfidf_vectorizer.get_feature_names()

    # grid search for the best alpha, l1_ratio combination,
    # measured by the lowest sum of squared residuals
    # l1_ratio: regularization mixing parameter (0 => l2 penalty, 1 => l1 penalty, (0, 1) => mixture)
    # alpha: constant that multiplies the regularization terms (0 => no regularization)
    squared_residuals = []
    params = {}
    models = []
    sorted_articles_dfs = []
    complete_topics_dfs = []
    alphas = list(np.arange(0.0, 1.2, 0.2))
    l1_ratios = list(np.arange(0.0, 1.2, 0.2))
    count_params = 0
    successes = 0
    count_successes = {}
    for a in alphas:
        for b in l1_ratios:
            # learn a model
            nmf = NMF(
                n_components=best_num_topics,
                init='nndsvd',     # non-negative double singular value decomposition
                max_iter=500,
                l1_ratio=b,
                solver='cd',       # coordinate descent
                alpha=a,
                tol=0.0001,
                random_state=42).fit(tfidf)
            try:
                # transform documents into the document-topic matrix (articles x topics)
                docweights = nmf.transform(tfidf)
                # topic dataframe: (best_num_topics x 8)
                # (topic num : top 8 words that describe the topic)
                n_top_words = 8
                topic_df = topic_table(nmf, tfidf_fn, n_top_words).T

                # clean the topic words
                topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)], axis=1)
                topic_df['topics'] = topic_df['topics'].str[0]
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: whitespace_tokenizer(x))
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: unique_words(x))
                topic_df['topics'] = topic_df['topics'].apply(
                    lambda x: [' '.join(x)])
                topic_df['topics'] = topic_df['topics'].str[0]

                # clean topic dataframe
                topic_df = topic_df['topics'].reset_index()
                topic_df.columns = ['topic_num', 'topics']
                topics = topic_df[['topic_num', 'topics']]

                # assign topics to each article
                title = newspaper_input['title'].tolist()
                df_temp = pd.DataFrame({
                    'title': title,
                    'topic_num': docweights.argmax(axis=1)
                })
                merged_topic = df_temp.merge(topic_df, on='topic_num', how='left')
                complete_df = merged_topic.merge(newspaper_input, on='title', how='left')
                complete_df = complete_df.drop_duplicates(subset=['title'])
                sorted_articles = complete_df.sort_values(by=['topic_num'])

                # keep track of how many articles are assigned to each topic
                num_articles_per_topic = []
                for topic in range(best_num_topics):
                    count = 0
                    for index, row in sorted_articles.iterrows():
                        if row['topic_num'] == topic:
                            count += 1
                    num_articles_per_topic.append(count)
                topics['num_articles'] = num_articles_per_topic

                # matrices from nmf (A = WH)
                mat_A = tfidf_vectorizer.transform(texts)
                mat_W = nmf.components_
                mat_H = nmf.transform(mat_A)
                # residuals: how well the topics approximate the data (observed value - predicted value)
                # 0 -> topics perfectly predict the data
                # residual = Frobenius norm of (tf-idf weights A) - (document-topic weights H) x (topic-term weights W)
                r = np.zeros(mat_A.shape[0])  # num articles
                for row in range(mat_A.shape[0]):
                    r[row] = np.linalg.norm(
                        mat_A[row, :] - mat_H[row, :].dot(mat_W), 'fro')
                sum_sqrt_res = round(sum(np.sqrt(r)), 3)
                squared_residuals.append(sum_sqrt_res)

                # add avg residual column to topics
                complete_df['resid'] = r
                sorted_articles = complete_df.sort_values(by=['topic_num'])
                resid_data = complete_df[[
                    'topic_num', 'resid'
                ]].groupby('topic_num').mean().sort_values(by='resid')
                complete_topics = topics.merge(resid_data, on='topic_num', how='left')

                # save results
                sorted_articles_dfs.append(sorted_articles)
                complete_topics_dfs.append(complete_topics)
                models.append(nmf)
                count_successes[count_params] = successes
                successes += 1
            except Exception:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print(exc_type, fname, exc_tb.tb_lineno)
            params[count_params] = (a, b)
            count_params += 1

    # find best params
    params_test = np.arange(36)  # 6 alphas x 6 l1_ratios
    resid_scores = list(zip(params_test, squared_residuals))
    resid_scores = sorted(resid_scores, key=itemgetter(1))
    best_params = resid_scores[0][0]
    print('test #{} had best residual score'.format(best_params))
    print('params: a={}, b={}'.format(params[best_params][0], params[best_params][1]))
    print('residual scores: {}'.format(resid_scores))
    best_articles = sorted_articles_dfs[count_successes[best_params]]
    best_topics = complete_topics_dfs[count_successes[best_params]]

    # call function that uses svc model to predict category based on topic words
    best_topics = predict_category(best_topics)

    # map stemmed topic words back to their unstemmed form
    for idx, row in best_topics.iterrows():
        new_words = ''
        topics_itr = row['topics'].split()
        for word in topics_itr:
            new_words += get_unstemmed_word(word)
            new_words += ' '
        best_topics.at[idx, 'topics'] = new_words

    # attach topic words and predicted category to each article
    categories = []
    for idx, row in best_articles.iterrows():
        topic_num = row['topic_num']
        topics = best_topics.at[topic_num, 'topics']
        categories.append(best_topics.at[topic_num, 'predicted_category'])
        best_articles.at[idx, 'topics'] = topics
    best_articles['predicted_category'] = categories

    # save results
    best_articles = best_articles.drop('processed_text', axis=1)
    best_articles = best_articles.drop('Unnamed: 0', axis=1)
    best_articles.to_csv('../output/topic/articles_with_nmf_topics.csv',
                         header=True, index=False)
    best_topics.to_csv('../output/topic/nmf_generated_topics.csv',
                       header=True, index=False)

    # save model and vectorizer
    with open('nmf_model.pickle', 'wb') as output:
        pickle.dump(models[count_successes[best_params]], output)
    with open('nmf_tfidf.pickle', 'wb') as output:
        pickle.dump(tfidf_vectorizer, output)
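
# `topic_table` is referenced above but its definition is not part of this
# section. The version below is only a sketch of the assumed behaviour
# (collect the n_top_words highest-weighted terms of each NMF component into
# a DataFrame whose columns are topic numbers); the project's actual helper
# may differ.
def topic_table(model, feature_names, n_top_words):
    word_dict = {}
    for topic_idx, topic_weights in enumerate(model.components_):
        # indices of the n_top_words largest weights, in descending order
        top_indices = topic_weights.argsort()[:-n_top_words - 1:-1]
        word_dict[topic_idx] = [feature_names[i] for i in top_indices]
    # columns = topic numbers, rows = word rank; callers transpose with .T
    return pd.DataFrame(word_dict)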
# Function to tokenize text on whitespace
# (tokenizer_regex is defined elsewhere in the project)
def whitespace_tokenizer(text):
    tokens = tokenizer_regex.tokenize(text)
    return tokens


# Function to remove duplicate words
def unique_words(text):
    ulist = []
    [ulist.append(x) for x in text if x not in ulist]
    return ulist


# Use the top words for each cluster by tf-idf weight to create 'topics'
# Getting a df with each topic by document
docweights = nmf.transform(tfidf_vectorizer.transform(texts))
n_top_words = 8
topic_df = topic_table(nmf, tfidf_fn, n_top_words).T

# Cleaning up the top words to create topic summaries
topic_df['topics'] = topic_df.apply(lambda x: [' '.join(x)], axis=1)  # joining the words into a list
topic_df['topics'] = topic_df['topics'].str[0]  # removing the list brackets
topic_df['topics'] = topic_df['topics'].apply(
    lambda x: whitespace_tokenizer(x))  # tokenizing
topic_df['topics'] = topic_df['topics'].apply(
    lambda x: unique_words(x))  # removing duplicate words
topic_df['topics'] = topic_df['topics'].apply(
    lambda x: [' '.join(x)])  # joining the words back into a list
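
# Example entry point; the CSV path below is an assumption for illustration.
# The input file must provide 'article' and 'title' columns, as used in nmf().
if __name__ == '__main__':
    nmf('../data/articles.csv')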