def analyse_people_sources(self):
    """ Do trend analysis on people. """
    utterance_count = self.count_utterances(self.people.keys())
    source_counts = self.source_frequencies(self.people.keys())

    self.analysed_people = {}
    for pid, person in self.people.iteritems():
        src = AnalysedSource()
        src.person = person
        src.utterance_count = utterance_count.get(src.person.id, 0)
        src.source_counts = source_counts[src.person.id]
        src.source_counts_total = sum(src.source_counts)
        self.analysed_people[pid] = src

    # normalise by total counts per day
    totals = [0] * (self.days + 1)

    # first count per-day totals
    for src in self.analysed_people.itervalues():
        for i, n in enumerate(src.source_counts):
            totals[i] += n

    # normalise
    for src in self.analysed_people.itervalues():
        for i, n in enumerate(src.source_counts):
            if totals[i] == 0:
                src.source_counts[i] = 0
            else:
                src.source_counts[i] = 100.0 * n / totals[i]

    # calculate trends and normalise source counts
    if self.analysed_people:
        biggest = max(src.source_counts_total for src in self.analysed_people.itervalues())

        for src in self.analysed_people.itervalues():
            src.source_counts_trend = moving_weighted_avg_zscore(src.source_counts, 0.8)
            src.source_counts_normalised = src.source_counts_total * 1.0 / biggest

    # top 20 sources
    self.top_people = sorted(
        self.analysed_people.itervalues(),
        key=lambda s: s.source_counts_total,
        reverse=True)[:20]

    # trends
    trending = sorted(
        self.analysed_people.itervalues(),
        key=lambda s: s.source_counts_trend)

    # top 10 trending up, most trending first
    self.people_trending_up = [s for s in trending[-10:] if s.source_counts_trend > self.TREND_UP]
    self.people_trending_up.reverse()

    # top 10 trending down, most trending first
    self.people_trending_down = [s for s in trending[:10] if s.source_counts_trend < self.TREND_DOWN]
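
# `moving_weighted_avg_zscore` is used above but not defined in this section.
# A minimal sketch of what such a helper could look like, assuming it scores
# the newest value in the series against an exponentially weighted moving
# average and standard deviation of the preceding values, and that the second
# argument (0.8 above) is the decay factor; the real helper's semantics may
# well differ.
import math

def moving_weighted_avg_zscore(series, decay=0.8):
    """ Z-score of the last value in `series` against an exponentially
    weighted moving average and std-dev of the values before it. """
    if len(series) < 2:
        return 0.0

    avg = float(series[0])
    sq_avg = avg ** 2
    for x in series[1:-1]:
        avg = avg * decay + x * (1 - decay)
        sq_avg = sq_avg * decay + (x ** 2) * (1 - decay)

    # variance via E[x^2] - E[x]^2, clamped against rounding error
    std = math.sqrt(max(sq_avg - avg ** 2, 0.0))
    if std == 0.0:
        return 0.0
    return (series[-1] - avg) / std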
def _analyse_people_mentions(self):
    """ Do trend analysis on people mentions. """
    mention_counts = self.mention_frequencies(self.people.keys())

    self.analysed_people = {}
    for pid, person in self.people.iteritems():
        mention = AnalysedMention()
        mention.person = person
        mention.mention_counts = mention_counts[pid]
        mention.mention_counts_total = sum(mention.mention_counts)
        self.analysed_people[pid] = mention

    # normalise by total counts per day
    totals = [0] * (self.days + 1)

    # first count per-day totals
    for mention in self.analysed_people.itervalues():
        for i, n in enumerate(mention.mention_counts):
            totals[i] += n

    # normalise
    for mention in self.analysed_people.itervalues():
        for i, n in enumerate(mention.mention_counts):
            if totals[i] == 0:
                mention.mention_counts[i] = 0
            else:
                mention.mention_counts[i] = 100.0 * n / totals[i]

    # calculate trends
    for mention in self.analysed_people.itervalues():
        mention.mention_counts_trend = moving_weighted_avg_zscore(mention.mention_counts, 0.8)

    # top 20 people by mention count
    self.top_people = sorted(
        self.analysed_people.itervalues(),
        key=lambda s: s.mention_counts_total,
        reverse=True)[:20]

    # trends
    trending = sorted(
        self.analysed_people.itervalues(),
        key=lambda s: s.mention_counts_trend)

    # top 10 trending up, most trending first
    self.people_trending_up = [s for s in trending[-10:] if s.mention_counts_trend > self.TREND_UP]
    self.people_trending_up.reverse()

    # top 10 trending down, most trending first
    self.people_trending_down = [s for s in trending[:10] if s.mention_counts_trend < self.TREND_DOWN]
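
# The per-day normalisation above turns raw counts into each person's share
# of that day's total, so that a generally busy news day doesn't register as
# a spike for everyone at once. A self-contained toy example of the same
# arithmetic, using hypothetical data rather than anything from this module:

def _normalisation_example():
    # two people tracked over three days
    counts = {
        'alice': [2, 0, 6],
        'bob':   [2, 4, 2],
    }

    # per-day totals across all people: [4, 4, 8]
    totals = [sum(day) for day in zip(*counts.values())]

    # each count as a percentage of its day's total
    shares = dict(
        (name, [0 if t == 0 else 100.0 * n / t for n, t in zip(row, totals)])
        for name, row in counts.items())

    # shares == {'alice': [50.0, 0.0, 75.0], 'bob': [50.0, 100.0, 25.0]}
    return shares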
def find_topics(self):
    """
    Run clustering on these documents and identify common topics.

    We use latent Dirichlet allocation (LDA) to cluster the documents
    into an arbitrary number of clusters. We then find the strongest
    clusters and pull representative documents for each cluster.

    Clustering is based on the people and entities mentioned in the
    documents, rather than raw text. This rests on the assumption that
    OpenCalais and AlchemyAPI have already done the work of identifying
    pertinent things and concepts in the documents, so we rely on those
    rather than on arbitrary words.

    The results are stored in `clustered_topics`.

    See also: https://github.com/ariddell/lda
    """
    from sklearn.feature_extraction import DictVectorizer
    import numpy

    # TODO: factor people into cluster calcs
    self.clustered_topics = []

    # load documents and their entities
    docs = Document.query\
        .options(subqueryload('entities'), subqueryload('medium'))\
        .filter(Document.id.in_(self.doc_ids))\
        .all()
    if not docs:
        return

    # guess at the number of topics, between 1 and 50
    n_topics = max(min(50, len(docs) / 5), 1)

    # list of entity maps for each document, from entity name to occurrence count
    entities = [dict(('%s-%s' % (de.entity.group, de.entity.name), de.count or 1)
                     for de in d.entities)
                for d in docs]

    vec = DictVectorizer(sparse=True)
    # TODO: we should ideally use sparse, but it causes the lda library to fail
    entity_vector = vec.fit_transform(entities)
    features = numpy.array(vec.feature_names_)

    clusters, lda_model = self._run_lda(entity_vector, n_topics)
    del entity_vector
    del vec

    # for normalising histograms
    day_counts = self.date_histogram(d.published_at for d in docs)

    # generate topic info
    for i, clustering in clusters.iteritems():
        # clustering is a list of (doc-index, score) pairs.
        # Sort each cluster to put top-scoring docs first.
        # TODO: this isn't great, because scores for documents in the
        # same cluster can't really be compared. We need a better way
        # of doing this.
        clustering.sort(key=lambda p: p[1], reverse=True)
        cluster_docs = [docs[p[0]] for p in clustering]

        cluster = Cluster.find_or_create(docs=cluster_docs)

        # top 7 features for this cluster as (feature, weight) pairs
        indexes = numpy.argsort(lda_model.components_[i])[:-8:-1]
        cluster.features = zip(features[indexes], lda_model.components_[i][indexes])

        # the top 20 documents in each cluster are used to characterise it
        best = clustering[0:20]
        cluster.score = numpy.median([p[1] for p in best])

        # keep only the clusters with a score > self.topic_score_threshold
        if cluster.score <= self.topic_score_threshold:
            continue

        # score for this cluster as stars, from 0 to 3
        cluster.stars = math.ceil(
            (cluster.score - self.topic_score_threshold) /
            ((1.0 - self.topic_score_threshold) / 3.0))

        # media counts
        media = dict(collections.Counter([d.medium for d in cluster_docs]))
        cluster.media_counts = sorted(media.items(), key=lambda p: p[1], reverse=True)

        # publication dates
        cluster.histogram = self.date_histogram(d.published_at for d in cluster_docs)
        cluster.trend = moving_weighted_avg_zscore(cluster.histogram)
        cluster.histogram = self.normalise_histogram(cluster.histogram, day_counts)

        self.clustered_topics.append(cluster)

    # sort clusters by score, strongest first
    self.clustered_topics.sort(key=lambda t: t.score, reverse=True)
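
# `_run_lda` is referenced above but not defined in this section. A plausible
# sketch, assuming it fits the `lda` package's Gibbs-sampling model
# (https://github.com/ariddell/lda, cited in the docstring above) and buckets
# each document under its highest-probability topic. The return shape, a dict
# of topic index to (doc-index, score) pairs plus the fitted model, is
# inferred from how find_topics consumes the result; n_iter and random_state
# below are illustrative choices, not values from this codebase.

def _run_lda(self, entity_vector, n_topics):
    import lda
    import numpy

    # the lda package expects an integer count matrix; densify if we were
    # handed a scipy sparse matrix
    X = entity_vector
    if hasattr(X, 'toarray'):
        X = X.toarray()
    X = X.astype(numpy.int64)

    model = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)
    # rows are documents, columns are per-topic probabilities
    doc_topic = model.fit_transform(X)

    # map topic index -> list of (doc-index, score) pairs, assigning each
    # document to its single strongest topic
    clusters = {}
    for doc_ix, topic_dist in enumerate(doc_topic):
        topic_ix = int(numpy.argmax(topic_dist))
        clusters.setdefault(topic_ix, []).append(
            (doc_ix, float(topic_dist[topic_ix])))

    return clusters, model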