def topic_count_by_date_range(self, table, start_date, end_date, doc_topic_threshold=.1, only_best_match=True):
    '''
    Count the articles published in a date range that match each topic.
    More granular and human-interpretable than topic_freq_by_date_range.

    Each article gets one score per topic: its vectorized text multiplied
    by self.H.T, divided by the value _get_article_lengths returns for it.

    If only_best_match: every article is counted once, for the topic with
        its highest score. doc_topic_threshold is NOT applied in this mode.
    Else: an article is counted for every topic whose score is strictly
        greater than doc_topic_threshold (so it may count for several
        topics, or for none).

    INPUT:  mongo-collection - table, string - start_date,
            string - end_date, float - doc_topic_threshold,
            bool - only_best_match
    OUTPUT: np array - count of matching articles per topic
    '''
    # Both bounds are inclusive.
    q = {'pub_date': {'$gte': start_date, '$lte': end_date}}
    docs = just_clean_text(table, q)
    # Each doc is indexed as (article id, cleaned text); only the text is
    # needed here, so docs is walked exactly once.
    texts = [d[1] for d in docs]

    # NOTE(review): assumes _get_article_lengths returns something that
    # broadcasts row-wise against an (n_docs, n_topics) array - confirm.
    article_lengths = _get_article_lengths(texts)
    X = self.vectorizer.transform(texts)
    doc_topic_freqs = X.dot(self.H.T) / article_lengths

    if only_best_match:
        # Flatten so the tally works whether the product above came back
        # as a plain ndarray or as an (n_docs, 1) np.matrix of indices.
        best_topic_per_doc = np.asarray(doc_topic_freqs.argmax(axis=1)).ravel()
        # minlength pads topics that never win with zeros; the slice keeps
        # the result at exactly num_topics entries.
        counts = np.bincount(best_topic_per_doc, minlength=self.num_topics)
        return counts[:self.num_topics]

    matches = doc_topic_freqs > doc_topic_threshold
    return matches.sum(axis=0)
def topic_freq_by_date_range(self, table, start_date, end_date, n_articles=1, topic_freq_threshold=.1):
    '''
    Compute the overall frequency of every topic across the records
    published in a date range.

    For each topic whose normalized frequency is strictly greater than
    topic_freq_threshold, the ids of the (up to) n_articles highest-scoring
    articles for that topic are included as well, best match first.
    Topics at or below the threshold get None in place of the ids.

    INPUT:  mongo-collection - table, string - start_date,
            string - end_date, int - n_articles,
            float - topic_freq_threshold
    OUTPUT: list - (topic index, topic frequency, example article(s))
            tuples, one per topic, in topic-index order
    '''
    # Both bounds are inclusive.
    date_query = {'pub_date': {'$gte': start_date, '$lte': end_date}}
    records = just_clean_text(table, date_query)

    # Each record is indexed as (article id, cleaned text).
    ids = np.array([rec[0] for rec in records])
    term_matrix = self.vectorizer.transform([rec[1] for rec in records])

    doc_topic_freqs = term_matrix.dot(self.H.T)
    # Sum the per-article scores per topic, then hand them to the
    # normalizing helper.
    topic_shares = _normalize_frequencies(doc_topic_freqs.sum(axis=0))

    results = []
    for topic in range(self.num_topics):
        share = topic_shares[topic]
        examples = None
        if share > topic_freq_threshold:
            # argsort is ascending, so reverse it to rank best-first.
            ranked = np.argsort(doc_topic_freqs[:, topic])[::-1]
            examples = ids[ranked[:n_articles]]
        results.append((topic, share, examples))
    return results