def get_tweets_in(location_id=my.HBK_LOCATION_ID):
    """Dump the user id and coordinates of every tweet inside a location.

    Loads the ``Location`` with the given id, selects the tweets whose
    ``geo`` field lies within that location's polygon, and writes one CSV
    row per tweet (``user_id, geo[0], geo[1]``) to
    ``'data/' + my.DATA_FOLDER + my.HBK_TWEET_LOC_FILE``. The output
    directory is created when it does not exist yet.

    Args:
        location_id: id of the Location to query. Defaults to
            ``my.HBK_LOCATION_ID`` (the Hollenbeck area).
    """
    # Django must be configured before its models can be imported, which is
    # why these imports live inside the function instead of at file level.
    import sys
    sys.path.append("/home/gambit/collector/gambit2/")
    from django.core.management import setup_environ
    from gambit import settings
    setup_environ(settings)
    from scraper.models import Location, Tweet

    print('Querying database...')
    loc = Location.objects.get(id=location_id)
    # Queries go through the model manager, exactly like the Location lookup
    # above; a plain Django model has no ``filter`` classmethod.
    tweets = Tweet.objects.filter(geo__within=loc.polygon)

    out_dir = 'data/' + my.DATA_FOLDER
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # NOTE(review): 'wb' is kept from the original; setup_environ implies a
    # Python 2 runtime, where the csv module expects a binary-mode file.
    with open(out_dir + my.HBK_TWEET_LOC_FILE, 'wb') as fp:
        csv_writer = csv.writer(fp, delimiter=',')
        for tweet in tweets:
            csv_writer.writerow([tweet.user_id, tweet.geo[0], tweet.geo[1]])

    print('Done! Fetched ' + str(tweets.count()) + ' entries.')
def _save_scraped_tweet(tweet, tweet_id, long_run):
    """Store one timeline entry together with its related rows.

    Creates the ``Tweet`` row and, as needed, the ``OtherUser`` it replies
    to, its ``FirstWord``, its ``HashTag`` rows and the ``OtherUser`` rows
    for mentioned accounts. Entries whose first word (as reported by
    ``tweet_cleaner``) is ``'RT'`` are skipped without storing anything.

    Args:
        tweet: the timeline entry, accessed like a dict.
        tweet_id: the entry's ``id_str`` value.
        long_run: when false, the cleaned text is also appended to the
            first ``Corpus`` row.
    """
    text_first_word, tweet_text_clean = tweet_cleaner(tweet)
    if text_first_word == 'RT':
        return

    entities = tweet['entities']
    hashtags = entities['hashtags']
    user_mentions = entities['user_mentions']

    # The format string only accepts a literal "+0000" offset, so the parsed
    # value is naive until the module-level ``tz`` is attached.
    pubdate = datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
    pubdate = pubdate.replace(tzinfo=tz)
    tweet_url = 'https://twitter.com/' + TARGET_HANDLE + '/status/' + str(tweet_id)

    in_reply_to_screen_name = tweet['in_reply_to_screen_name']
    in_reply_to_user_id = tweet['in_reply_to_user_id_str']
    is_quote_status = tweet['is_quote_status']

    is_reply = bool(in_reply_to_screen_name)
    if is_reply:
        reply_user = OtherUser.objects.get_or_create(
            handle=in_reply_to_screen_name,
            defaults={'twitter_id': in_reply_to_user_id},
        )[0]
    else:
        reply_user = None

    # Always bind first_word: previously it was left unbound (NameError on
    # the first tweet) or kept the previous tweet's value for quote tweets
    # and tweets without a first word.
    if not is_quote_status and text_first_word is not None:
        tweet_first_word = FirstWord.objects.get_or_create(word=text_first_word)[0]
    else:
        tweet_first_word = None

    new_tweet = Tweet(
        text=tweet['text'],
        clean_text=tweet_text_clean,
        tweet_id=tweet_id,
        date_added=datetime.now(pytz.utc),
        twitter_published_date=pubdate,
        tweet_url=tweet_url,
        retweet_count=tweet['retweet_count'],
        favourite_count=tweet['favorite_count'],
        is_reply=is_reply,
        reply_user=reply_user,
        in_reply_to_status_id=tweet['in_reply_to_status_id_str'],
        is_quote_status=is_quote_status,
        # This name was never assigned before. The payload field is read
        # with .get() because it is only expected on quote tweets.
        # NOTE(review): confirm 'quoted_status_id_str' is the intended
        # source field for this column.
        quoted_status_id=tweet.get('quoted_status_id_str'),
        first_word=tweet_first_word,
        source=tweet['source'],
        json_dump=json.dumps(tweet),
    )
    new_tweet.save()

    if not long_run:
        corpus = Corpus.objects.all()[0]
        corpus.content = corpus.content + new_tweet.clean_text + ' '
        corpus.save()

    for hashtag in hashtags or ():
        hashtag_obj = HashTag.objects.get_or_create(text=hashtag['text'])[0]
        hashtag_obj.related_tweet.add(new_tweet)

    for user_mention in user_mentions or ():
        user_mention_obj = OtherUser.objects.get_or_create(
            handle=user_mention['screen_name'],
            defaults={'twitter_id': user_mention['id_str']},
        )[0]
        user_mention_obj.related_tweet.add(new_tweet)


def scrape_target(long_run=False):
    """Fetch TARGET_HANDLE's timeline page by page and store unseen tweets.

    The first request is anchored on the stored tweet with the latest
    ``twitter_published_date`` (no anchor when nothing is stored yet):

    * ``long_run=False``: request tweets with ``since_id`` set to that
      tweet and stop at the first tweet that is already stored.
    * ``long_run=True``: request tweets with ``max_id`` set to that tweet,
      skip tweets that are already stored and keep paging.

    Each following page is requested with ``max_id`` set to the last tweet
    id of the page just processed. Paging also stops when a page ends on
    the same tweet id as the previous one, or when a page comes back empty.

    Args:
        long_run: selects the paging mode described above.
    """
    try:
        latest_tweet_id = Tweet.objects.latest('twitter_published_date').tweet_id
    except ObjectDoesNotExist:
        latest_tweet_id = None

    if long_run:
        timeline = api.user_timeline(id=TARGET_HANDLE, max_id=latest_tweet_id, count=200)
    else:
        timeline = api.user_timeline(id=TARGET_HANDLE, since_id=latest_tweet_id, count=200)

    previous_cursor = None
    while timeline:
        for tweet in timeline:
            tweet_id = tweet['id_str']
            if Tweet.objects.filter(tweet_id=tweet_id).exists():
                if long_run:
                    # Known tweet: keep walking through the page.
                    continue
                # Incremental run: everything from here on is already stored.
                timeline = []
                break
            _save_scraped_tweet(tweet, tweet_id, long_run)
        else:
            # The page was consumed without hitting the stop condition.
            # Twitter's max_id is inclusive, so asking for the next page
            # always returns the cursor tweet again. Once a page ends on the
            # same id as the previous one, the cursor has stopped advancing
            # and there is nothing older to fetch. This replaces the old
            # `check` counter, which was reset before it could ever trigger.
            if tweet_id == previous_cursor:
                break  # leaves the while loop
            previous_cursor = tweet_id
            timeline = api.user_timeline(id=TARGET_HANDLE, max_id=tweet_id, count=200)