Exemplo n.º 1
0
def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True,
    user_locations=True, ngrams=(1,2), ngram_stopwords=None, ngram_hashtags=True, ngram_mentions=True,
    ngram_rts=False, ngram_mts=False, ngram_https=False):
    """Count several kinds of entities across every tweet in ``collection``.

    One ``Counter`` is kept per entity kind. Each boolean flag switches one
    kind on or off:

    * ``urls``: items yielded by ``get_urls(tweet)``, key ``'urls'``.
    * ``images``: items yielded by ``get_image_urls(tweet)``, key ``'images'``.
    * ``hts``: items yielded by ``get_hashtags(tweet)``, key ``'hts'``.
    * ``mentions``: items yielded by ``get_users_mentioned(tweet)``, key
      ``'mentions'``.
    * ``geolocation_names``: ``tweet['place']['full_name']``; a tweet with no
      ``'place'`` key or a ``None`` place is counted under ``None``.
    * ``user_locations``: ``tweet['user'].get('location')``; a missing
      location is counted under ``None``.

    Args:
        collection: Iterable of tweets. Each tweet is indexed like a mapping
            (``tweet['user']``, ``tweet["text"]``, ``tweet['place']``).
        n: Forwarded unchanged to ``_counter_to_series`` for every counter
            (presumably the number of top entries to keep -- confirm against
            that helper).
        ngrams: Iterable of n-gram sizes. For each size ``k`` the tokens of
            ``tweet["text"]`` are counted under the key ``'<k>-grams'``, with
            each n-gram joined by a single space. A falsy value disables
            n-gram counting entirely.
        ngram_stopwords: Forwarded to ``get_cleaned_tokens`` as ``stopwords``.
            ``None`` (the default) means an empty list.
        ngram_hashtags: Forwarded as ``keep_hashtags``.
        ngram_mentions: Forwarded as ``keep_mentions``.
        ngram_rts: Forwarded as ``rts``.
        ngram_mts: Forwarded as ``mts``.
        ngram_https: Forwarded as ``https``.

    Returns:
        A dict mapping each counter key that was touched to
        ``_counter_to_series(counter, n)``. Kinds that never produced an item
        (or were disabled) have no entry, so an empty collection yields ``{}``.
    """
    # Use a fresh list per call instead of a shared mutable ``[]`` default, so
    # nothing downstream can leak state from one call into the next.
    if ngram_stopwords is None:
        ngram_stopwords = []

    counters = defaultdict(Counter)
    for tweet in collection:
        # Explicit per-item increments (rather than Counter.update) keep a
        # key absent from the result when a tweet contributes no items.
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            place = tweet['place'] if 'place' in tweet else None
            place_name = place['full_name'] if place is not None else None
            counters['geolocation_names'][place_name] += 1
        if user_locations:
            counters['user_locations'][tweet['user'].get('location')] += 1
        if ngrams:
            # Tokenise once per tweet and reuse the tokens for every size.
            tokens = get_cleaned_tokens(tweet["text"],
                                        keep_hashtags=ngram_hashtags,
                                        keep_mentions=ngram_mentions,
                                        rts=ngram_rts,
                                        mts=ngram_mts,
                                        https=ngram_https,
                                        stopwords=ngram_stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams)
    return {key: _counter_to_series(counters[key], n) for key in counters}
Exemplo n.º 2
0
def _top_entities(collection,
                  n=10,
                  urls=True,
                  images=True,
                  hts=True,
                  mentions=True,
                  geolocation_names=True,
                  user_locations=True,
                  ngrams=(1, 2),
                  ngram_stopwords=None,
                  ngram_hashtags=True,
                  ngram_mentions=True,
                  ngram_rts=False,
                  ngram_mts=False,
                  ngram_https=False):
    """Tally the entities found in a collection of tweets, one counter per kind.

    Args:
        collection: Iterable of tweets; each one is indexed like a mapping
            (``tweet['place']``, ``tweet['user']``, ``tweet["text"]``).
        n: Passed as-is to ``_counter_to_series`` together with each counter
            (presumably a top-N cut-off -- confirm against that helper).
        urls: Count the items of ``get_urls(tweet)`` under ``'urls'``.
        images: Count the items of ``get_image_urls(tweet)`` under
            ``'images'``.
        hts: Count the items of ``get_hashtags(tweet)`` under ``'hts'``.
        mentions: Count the items of ``get_users_mentioned(tweet)`` under
            ``'mentions'``.
        geolocation_names: Count ``tweet['place']['full_name']`` under
            ``'geolocation_names'``; tweets whose ``'place'`` is missing or
            ``None`` are counted under the key ``None``.
        user_locations: Count ``tweet['user'].get('location')`` under
            ``'user_locations'`` (``None`` when the user has no such key).
        ngrams: Iterable of n-gram sizes; each size ``k`` is counted under
            ``'<k>-grams'`` with the n-gram joined by single spaces. A falsy
            value skips tokenisation altogether.
        ngram_stopwords: ``stopwords`` argument for ``get_cleaned_tokens``;
            ``None`` (default) is treated as an empty list.
        ngram_hashtags: ``keep_hashtags`` argument for ``get_cleaned_tokens``.
        ngram_mentions: ``keep_mentions`` argument for ``get_cleaned_tokens``.
        ngram_rts: ``rts`` argument for ``get_cleaned_tokens``.
        ngram_mts: ``mts`` argument for ``get_cleaned_tokens``.
        ngram_https: ``https`` argument for ``get_cleaned_tokens``.

    Returns:
        Dict from counter key to ``_counter_to_series(counter, n)``. Only
        counters that were actually touched appear in it.
    """
    # The default used to be a literal ``[]``, i.e. one list object shared by
    # every call; build a new one per call instead.
    stopwords = [] if ngram_stopwords is None else ngram_stopwords

    counters = defaultdict(Counter)
    for tweet in collection:
        # Items are incremented one by one so that a kind with no items in
        # any tweet never gets an (empty) entry in the result.
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            has_place = 'place' in tweet and tweet['place'] is not None
            full_name = tweet['place']['full_name'] if has_place else None
            counters['geolocation_names'][full_name] += 1
        if user_locations:
            location = tweet['user'].get('location', None)
            counters['user_locations'][location] += 1
        if ngrams:
            # One tokenisation per tweet serves every requested n-gram size.
            tokens = get_cleaned_tokens(tweet["text"],
                                        keep_hashtags=ngram_hashtags,
                                        keep_mentions=ngram_mentions,
                                        rts=ngram_rts,
                                        mts=ngram_mts,
                                        https=ngram_https,
                                        stopwords=stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(
                    ' '.join(e) for e in grams)
    return {key: _counter_to_series(counters[key], n) for key in counters}
Exemplo n.º 3
0
def _top_hashtags(collection, n=10):
    """Count lower-cased hashtags over all tweets in ``collection``.

    Every item yielded by ``get_hashtags(tweet)`` is lower-cased before being
    counted, so tags differing only in case share one count. The resulting
    ``Counter`` and ``n`` are handed to ``_counter_to_series``, whose return
    value is returned unchanged.
    """
    tally = Counter()
    for tweet in collection:
        tally.update(tag.lower() for tag in get_hashtags(tweet))
    return _counter_to_series(tally, n)
Exemplo n.º 4
0
def _top_hashtags(collection, n=10):
    """Return ``_counter_to_series`` of the case-folded hashtag counts.

    Hashtags come from ``get_hashtags(tweet)`` for each tweet in
    ``collection`` and are lower-cased before counting; ``n`` is forwarded
    to ``_counter_to_series`` untouched.
    """
    lowered_tags = (
        hashtag.lower()
        for tweet in collection
        for hashtag in get_hashtags(tweet)
    )
    return _counter_to_series(Counter(lowered_tags), n)