def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True, user_locations=True, ngrams=(1,2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True, ngram_rts=False, ngram_mts=False, ngram_https=False): counters = defaultdict(Counter) for tweet in collection: if urls: for url in get_urls(tweet): counters['urls'][url] += 1 if images: for url in get_image_urls(tweet): counters['images'][url] += 1 if hts: for ht in get_hashtags(tweet): counters['hts'][ht] += 1 if mentions: for um in get_users_mentioned(tweet): counters['mentions'][um] += 1 if geolocation_names: counters['geolocation_names'][tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1 if user_locations: counters['user_locations'][tweet['user'].get('location', None)] += 1 if ngrams: tokens = get_cleaned_tokens(tweet["text"], keep_hashtags=ngram_hashtags, keep_mentions=ngram_mentions, rts=ngram_rts, mts=ngram_mts, https=ngram_https, stopwords=ngram_stopwords) for ngram in ngrams: grams = get_ngrams(tokens, ngram) counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams) return { key: _counter_to_series(counters[key], n) for key in counters }
def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True, user_locations=True, ngrams=(1, 2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True, ngram_rts=False, ngram_mts=False, ngram_https=False): counters = defaultdict(Counter) for tweet in collection: if urls: for url in get_urls(tweet): counters['urls'][url] += 1 if images: for url in get_image_urls(tweet): counters['images'][url] += 1 if hts: for ht in get_hashtags(tweet): counters['hts'][ht] += 1 if mentions: for um in get_users_mentioned(tweet): counters['mentions'][um] += 1 if geolocation_names: counters['geolocation_names'][ tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1 if user_locations: counters['user_locations'][tweet['user'].get('location', None)] += 1 if ngrams: tokens = get_cleaned_tokens(tweet["text"], keep_hashtags=ngram_hashtags, keep_mentions=ngram_mentions, rts=ngram_rts, mts=ngram_mts, https=ngram_https, stopwords=ngram_stopwords) for ngram in ngrams: grams = get_ngrams(tokens, ngram) counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams) return {key: _counter_to_series(counters[key], n) for key in counters}
def get_image_occurrences(tweets):
    """Count how often each image URL occurs across ``tweets``.

    Args:
        tweets: A cursor (or any iterable) of tweets. Each tweet is checked
            with ``contains_image``; for those that pass, every URL returned
            by ``get_image_urls`` is counted.

    Returns:
        A list of ``(url, num_occurrences)`` pairs sorted by
        ``num_occurrences``, greatest first. URLs with equal counts stay in
        the dict's iteration order, because the sort is stable.
    """
    url_count = {}
    for tweet in tweets:
        if not contains_image(tweet):
            continue
        for url in get_image_urls(tweet):
            url_count[url] = url_count.get(url, 0) + 1

    # On Python 3 dict.items() is a view with no .sort() method; sorted()
    # returns a real list on both Python 2 and Python 3.
    return sorted(url_count.items(), key=lambda pair: pair[1], reverse=True)
def _top_images(collection, n=10):
    """Count image URLs over all tweets in ``collection``.

    Every URL yielded by ``get_image_urls(tweet)`` is tallied in a
    ``Counter``, which is then handed to ``_counter_to_series`` together
    with ``n`` (presumably to keep the ``n`` most frequent; confirm against
    ``_counter_to_series``).
    """
    image_counts = Counter()
    for tweet in collection:
        for image_url in get_image_urls(tweet):
            image_counts[image_url] += 1
    return _counter_to_series(image_counts, n)
def _top_images(collection, n=10):
    """Tally the image URLs found in ``collection``.

    The URLs from ``get_image_urls`` for each tweet are flattened into one
    stream and counted. The resulting ``Counter`` and ``n`` are passed to
    ``_counter_to_series``, whose return value is returned unchanged.
    """
    all_image_urls = (
        image_url
        for status in collection
        for image_url in get_image_urls(status)
    )
    return _counter_to_series(Counter(all_image_urls), n)