Python get_image_urls示例，smappPy.entities.get_image_urls Python示例

示例#1

0

显示文件

文件： counter_functions.py 项目： IWhisper/smapp-toolkit

def _top_entities(collection, n=10, urls=True, images=True, hts=True, mentions=True, geolocation_names=True,
    user_locations=True, ngrams=(1,2), ngram_stopwords=[], ngram_hashtags=True, ngram_mentions=True,
    ngram_rts=False, ngram_mts=False, ngram_https=False):
    counters = defaultdict(Counter)
    for tweet in collection:
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            counters['geolocation_names'][tweet['place']['full_name'] if 'place' in tweet and tweet['place'] is not None else None] += 1
        if user_locations:
            counters['user_locations'][tweet['user'].get('location', None)] += 1
        if ngrams:
            tokens = get_cleaned_tokens(tweet["text"],
                                keep_hashtags=ngram_hashtags,
                                keep_mentions=ngram_mentions,
                                rts=ngram_rts,
                                mts=ngram_mts,
                                https=ngram_https,
                                stopwords=ngram_stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(' '.join(e) for e in grams)
    return { key: _counter_to_series(counters[key], n) for key in counters }

示例#2

0

显示文件

文件： counter_functions.py 项目： kbenoit/smapp-toolkit

def _top_entities(collection,
                  n=10,
                  urls=True,
                  images=True,
                  hts=True,
                  mentions=True,
                  geolocation_names=True,
                  user_locations=True,
                  ngrams=(1, 2),
                  ngram_stopwords=[],
                  ngram_hashtags=True,
                  ngram_mentions=True,
                  ngram_rts=False,
                  ngram_mts=False,
                  ngram_https=False):
    counters = defaultdict(Counter)
    for tweet in collection:
        if urls:
            for url in get_urls(tweet):
                counters['urls'][url] += 1
        if images:
            for url in get_image_urls(tweet):
                counters['images'][url] += 1
        if hts:
            for ht in get_hashtags(tweet):
                counters['hts'][ht] += 1
        if mentions:
            for um in get_users_mentioned(tweet):
                counters['mentions'][um] += 1
        if geolocation_names:
            counters['geolocation_names'][
                tweet['place']['full_name'] if 'place' in
                tweet and tweet['place'] is not None else None] += 1
        if user_locations:
            counters['user_locations'][tweet['user'].get('location',
                                                         None)] += 1
        if ngrams:
            tokens = get_cleaned_tokens(tweet["text"],
                                        keep_hashtags=ngram_hashtags,
                                        keep_mentions=ngram_mentions,
                                        rts=ngram_rts,
                                        mts=ngram_mts,
                                        https=ngram_https,
                                        stopwords=ngram_stopwords)
            for ngram in ngrams:
                grams = get_ngrams(tokens, ngram)
                counters['{}-grams'.format(ngram)].update(' '.join(e)
                                                          for e in grams)
    return {key: _counter_to_series(counters[key], n) for key in counters}

示例#3

0

显示文件

文件： image_util.py 项目： SONEINT/smappPy

def get_image_occurrences(tweets):
    """Count how often each image URL occurs in an iterable of tweets.

    Takes a cursor (or any iterable) of tweets and returns a list of
    (url, num_occurrences) pairs, sorted by num_occurrences with the greatest
    first. Only tweets for which `contains_image` is truthy are inspected;
    their URLs are taken from `get_image_urls`.

    An empty input yields an empty list.
    """
    url_count = {}
    for tweet in tweets:
        if not contains_image(tweet):
            continue
        for url in get_image_urls(tweet):
            url_count[url] = url_count.get(url, 0) + 1
    # On Python 3, dict.items() is a view with no .sort() method; sorted()
    # builds the list on both Python 2 and Python 3.
    return sorted(url_count.items(), key=lambda pair: pair[1], reverse=True)

示例#4

0

显示文件

def get_image_occurrences(tweets):
    """Return image URLs found in `tweets` with their occurrence counts.

    Takes a cursor (or any iterable) of tweets. Tweets for which
    `contains_image` is falsy are skipped; for the rest, every URL returned
    by `get_image_urls` is counted.

    Returns a list of (url, num_occurrences) pairs sorted by num_occurrences,
    greatest first. An empty input yields an empty list.
    """
    url_count = {}
    for tweet in tweets:
        if not contains_image(tweet):
            continue
        for url in get_image_urls(tweet):
            url_count[url] = url_count.get(url, 0) + 1
    # dict.items() has no .sort() on Python 3 (it returns a view), so the
    # sorted list is built with sorted(), which works on Python 2 and 3.
    return sorted(url_count.items(), key=lambda pair: pair[1], reverse=True)

示例#5

0

显示文件

文件： counter_functions.py 项目： kbenoit/smapp-toolkit

def _top_images(collection, n=10):
    """Count image URLs over all tweets in `collection`.

    Every URL returned by `get_image_urls` for each tweet is tallied in a
    Counter, which is then passed to `_counter_to_series` together with `n`.
    """
    tally = Counter()
    for tweet in collection:
        for image_url in get_image_urls(tweet):
            tally[image_url] += 1
    return _counter_to_series(tally, n)

示例#6

0

显示文件

文件： counter_functions.py 项目： IWhisper/smapp-toolkit

def _top_images(collection, n=10):
    """Tally the image URLs of every tweet in `collection`.

    The URLs come from `get_image_urls`; the resulting Counter and `n` are
    handed to `_counter_to_series`, whose result is returned.
    """
    image_counts = Counter()
    for status in collection:
        for link in get_image_urls(status):
            image_counts[link] += 1
    return _counter_to_series(image_counts, n)