def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True, user_locations=True, ngrams=(1,2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True, ngram_rts=False, ngram_mts=False, ngram_https=False): counters = defaultdict(Counter) for tweet in collection: if urls: for url in get_urls(tweet): counters['urls'][url] += 1 if images: for url in get_image_urls(tweet): counters['images'][url] += 1 if hts: for ht in get_hashtags(tweet): counters['hts'][ht] += 1 if mentions: for um in get_users_mentioned(tweet): counters['mentions'][um] += 1 if geolocation_names: counters['geolocation_names'][tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1 if user_locations: counters['user_locations'][tweet['user'].get('location', None)] += 1 if ngrams: tokens = get_cleaned_tokens(tweet["text"], keep_hashtags=ngram_hashtags, keep_mentions=ngram_mentions, rts=ngram_rts, mts=ngram_mts, https=ngram_https, stopwords=ngram_stopwords) for ngram in ngrams: grams = get_ngrams(tokens, ngram) counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams) return { key: _counter_to_series(counters[key], n) for key in counters }
def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True, user_locations=True, ngrams=(1, 2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True, ngram_rts=False, ngram_mts=False, ngram_https=False): counters = defaultdict(Counter) for tweet in collection: if urls: for url in get_urls(tweet): counters['urls'][url] += 1 if images: for url in get_image_urls(tweet): counters['images'][url] += 1 if hts: for ht in get_hashtags(tweet): counters['hts'][ht] += 1 if mentions: for um in get_users_mentioned(tweet): counters['mentions'][um] += 1 if geolocation_names: counters['geolocation_names'][ tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1 if user_locations: counters['user_locations'][tweet['user'].get('location', None)] += 1 if ngrams: tokens = get_cleaned_tokens(tweet["text"], keep_hashtags=ngram_hashtags, keep_mentions=ngram_mentions, rts=ngram_rts, mts=ngram_mts, https=ngram_https, stopwords=ngram_stopwords) for ngram in ngrams: grams = get_ngrams(tokens, ngram) counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams) return {key: _counter_to_series(counters[key], n) for key in counters}
def _top_mentions(collection, n=10):
    """Count how often each user is mentioned across ``collection``.

    Every value yielded by ``get_users_mentioned(tweet)`` is tallied, and the
    tally is handed to ``_counter_to_series`` together with ``n``.
    """
    mention_counts = Counter()
    for tweet in collection:
        for user in get_users_mentioned(tweet):
            mention_counts[user] += 1
    return _counter_to_series(mention_counts, n)
def _top_mentions(collection, n=10):
    """Tally mentioned users over all tweets in ``collection``.

    The mentions of each tweet come from ``get_users_mentioned``; the
    resulting ``Counter`` and ``n`` are passed on to ``_counter_to_series``,
    whose return value is returned unchanged.
    """
    mentioned_users = (
        user
        for tweet in collection
        for user in get_users_mentioned(tweet)
    )
    tally = Counter(mentioned_users)
    return _counter_to_series(tally, n)