# Exemplo n.º 1
def crawl_tweets_for_event(event_id):
    r = Redis()
    p = HTMLParser()

    total_tweets = 0

    event_title = r.get("festival:%s:title" % event_id).decode("utf-8", errors="ignore")
    event_title = strip_accents(event_title)
    event_title = p.unescape(event_title)
    event_title = remove_stopwords(event_title)

    artists = r.get("festival:%s:artists" % event_id)

    for k, v in eval(artists).items():
        if type(v) == list:
            for artist in v:
                print F, "searching tweets for %s %s" % (k, artist)
                total_tweets += search_term(artist)
        elif type(v) == str:
            print F, "searching tweets for %s %s" % (k, v)
            total_tweets += search_term(v)

    r.incr("festival:%s:crawled_times" % event_id)

    print F, "searching tweets for festival title: %s" % event_title
    total_tweets += search_term(event_title, event_id)  # newsid
    print F, "total tweets: %d" % total_tweets
# Exemplo n.º 2
def enrich_news(redis):
    keys = redis.keys('news:*:id')

    if len(keys) == 0:
        return

    news_tweets = []
    pages = []
    for key in keys:
        locale = redis.get('news:%s:locale' % key.split(':')[1])

        if locale == 'en_us':
            lang = 'english'
        else:
            lang = 'spanish'

        id = key.split(':')[1]
        terms = get_search_terms_news(redis, id, lang)

        for term in terms:
            tweets, pages = twitter.search_term(term)

            for tweet in tweets:
                tweet.event_id = id

            news_tweets.extend(tweets)
            pages.extend(pages)

    print tag, "got", len(news_tweets), 'tweets for news'
    return (news_tweets, pages)
# Exemplo n.º 3
def enrich_festivals(redis):
    import datetime

    keys = redis.keys('festival:*:startDate')
    to_search_keys = []

    # solo buscar en los festivales que estan pasando ahora
    for key in keys:
        startDate = redis.get(key)
        id = key.split(':')[1]

        startDate = datetime.datetime.strptime(startDate, '%a, %d %b %Y %H:%M:%S')
        if datetime.datetime.today() >= startDate:
            to_search_keys.append(key)

    if len(keys) == 0:
        return

    festivals_tweets = []
    pages = []
    for key in to_search_keys:
        id = key.split(':')[1]
        terms = get_search_terms_festivals(redis, id)

        for term in terms:
            tweets, pages = twitter.search_term(term)

            for tweet in tweets:
                tweet.event_id = id

            festivals_tweets.extend(tweets)
            pages.extend(pages)

    print tag, "got", len(festivals_tweets), 'tweets for festivals'
    return (festivals_tweets, pages)
# Exemplo n.º 4
def crawl_week_later():
	gn = GoogleNews()
	r = Redis()
	
	all_news = r.keys('page:*:title')

	i = 1
	total = 0
	print F, "total news: %d" % len(all_news)	
	for key_news_title in all_news:
		newsid = key_news_title.split(':')[1] 

		if r.get('page:%s:crawled_week' % newsid) is None and r.get('page:%s:crawled_day' % newsid) is not None:
			i += 1

			if r.get('page:%s:locale' % newsid) == 'es_cl':
				lang = 'spanish'
			else: 
				lang = 'english'

			news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore')))
			news_title = remove_stopwords(news_title_stopwords, lang=lang)
			print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords			
			print F, "searching tweets for news (w/o sw): \"%s\"" % news_title

			print F, "searching tweets for news: %s" % news_title 
			total += search_term(news_title, newsid) # search by title
	print F, "total news searched: %d" % i
	print F, "total tweets crawled: %d" % total
# Exemplo n.º 5
def enrich_event(redis_key):
    tag = '[events_enricher]'
    redis = Redis()
    queries = list(set(redis.lrange(redis_key, 0, -1)))
    tweets = []

    for query in queries:
        #print tag, 'searching "%s"' % query.decode('utf-8', errors='ignore')
        tweets.extend(search_term(query.decode('utf-8', errors='ignore')))

    print tag, 'got', len(tweets), 'tweets'
    event_id = redis_key.split(':')[1]
    save_tweets(tweets, event_id)
# Exemplo n.º 6
def crawl_current_day():
	gn = GoogleNews()
	r = Redis()

	gn.get_topnews()

	all_news = r.keys('page:*:title')
	p = HTMLParser()

	i = 1
	total = 0
	print F, "total pages: %d" % len(all_news)
	for key_news_title in all_news:
		newsid = key_news_title.split(':')[1] 

		# only interested in news here
		if r.get('page:%s:type' % newsid) != 'news':
			continue

		# and pages not already crawled in its first day
		if r.get('page:%s:crawled_day' % newsid) is None:			
			i += 1

			# lang for stopwords remove
			if r.get('page:%s:locale' % newsid) == 'es_cl':
				lang = 'spanish'
			else: 
				lang = 'english'

			news_title_stopwords = p.unescape(strip_accents(r.get(key_news_title).decode('utf-8', errors='ignore')))
			news_title = remove_stopwords(news_title_stopwords, lang=lang)

			print F, "searching tweets for news (w/ sw): \"%s\"" % news_title_stopwords			
			print F, "searching tweets for news (w/o sw): \"%s\"" % news_title

			# mark its news' first day as searched
			r.incr('page:%s:crawled_day' % newsid)

			# search by title in twitter
			total += search_term(news_title, newsid) 

	print F, "total news searched: %d" % i
	print F, "total tweets crawled: %d" % total