示例#1
0
    def topic_count_by_date_range(self, table, start_date, end_date,
                                  doc_topic_threshold=.1,
                                  only_best_match=True):
        '''
        Count the articles published in a date range that match each topic.

        More granular and human-interpretable than
            topic_freq_by_date_range. If only_best_match: counts, for each
            topic, the articles whose highest-scoring topic is that topic
            (doc_topic_threshold is not used in this mode). Else: counts,
            for each topic, every article whose score for that topic is
            strictly greater than doc_topic_threshold, so a single article
            may be counted under several topics.

        INPUT:  mongo-collection - table, string - start_date,
                string - end_date, float - doc_topic_threshold,
                bool - only_best_match
        OUTPUT: np array - count of matching articles per topic
        '''
        # Both date bounds are inclusive ($gte / $lte).
        q = {'pub_date': {'$gte': start_date, '$lte': end_date}}
        docs = just_clean_text(table, q)
        # Only the second field of each record (the text) is needed; the
        # first field (the article id) has no influence on the counts.
        texts = [d[1] for d in docs]
        article_lengths = _get_article_lengths(texts)
        X = self.vectorizer.transform(texts)
        # Per-document topic scores (documents along axis 0, topics along
        # axis 1), scaled by the values from _get_article_lengths.
        # NOTE(review): assumes article_lengths broadcasts row-wise against
        # the score matrix -- confirm against _get_article_lengths.
        doc_topic_freqs = X.dot(self.H.T) / article_lengths
        if only_best_match:
            # Tally the index of the top-scoring topic of every document.
            best_matches = Counter(doc_topic_freqs.argmax(axis=1))
            # Counter yields 0 for topics that never came out on top, so the
            # result always has exactly num_topics entries.
            return np.array([best_matches[i] for i in range(self.num_topics)])
        matches = doc_topic_freqs > doc_topic_threshold
        # Column sums of the boolean mask = number of documents per topic.
        return matches.sum(axis=0)
示例#2
0
    def topic_freq_by_date_range(self, table, start_date, end_date,
                                 n_articles=1, topic_freq_threshold=.1):
        '''
        Compute the normalized frequency of every topic over all records
            published in a date range. For each topic whose normalized
            frequency is strictly above topic_freq_threshold, the ids of
            the n_articles highest-scoring documents are attached as
            examples; otherwise the examples slot is None.

        INPUT:  mongo-collection - table, string - start_date,
                string - end_date, int - n_articles,
                float - topic_freq_threshold
        OUTPUT: list - (topic index, topic frequency, example
                article(s)) tuples, one per topic in index order
        '''
        # Both date bounds are inclusive ($gte / $lte).
        date_query = {'pub_date': {'$gte': start_date, '$lte': end_date}}
        records = just_clean_text(table, date_query)
        # Each record is indexed as (id, text).
        ids = np.array([rec[0] for rec in records])
        doc_terms = self.vectorizer.transform([rec[1] for rec in records])
        # Documents along axis 0, topics along axis 1.
        scores = doc_terms.dot(self.H.T)
        topic_shares = _normalize_frequencies(scores.sum(axis=0))

        def describe(topic):
            # Build the (index, frequency, examples) tuple for one topic.
            if topic_shares[topic] > topic_freq_threshold:
                # Document positions ordered from highest to lowest score.
                ranked = np.argsort(scores[:, topic])[::-1]
                return (topic, topic_shares[topic], ids[ranked[:n_articles]])
            return (topic, topic_shares[topic], None)

        return [describe(topic) for topic in range(self.num_topics)]