def test_get_weibo_tweets_by_name(self):
    result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='嘻红豆', pages=1)
    for i in result_iterator:
        print(i)
    result_iterator2 = weibo_scraper.get_weibo_tweets_by_name(name='nicknameisnotexist', pages=1)
    for i in result_iterator2:
        print(i)
def test_get_weibo_tweets_by_name(self):
    for i in weibo_scraper.get_weibo_tweets_by_name(name='Helixcs', pages=1):
        print(i)
    for i in weibo_scraper.get_weibo_tweets_by_name(name='nicknameisnotexist', pages=1):
        self.assertIsNone(i)
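# A stricter variant than printing: materialize the iterator and assert on the
# result. This is a minimal sketch, assuming the same weibo_scraper API and a
# unittest.TestCase context; the test name is hypothetical.
def test_nonexistent_name_yields_nothing(self):
    tweets = [t for t in weibo_scraper.get_weibo_tweets_by_name(name='nicknameisnotexist', pages=1)
              if t is not None]
    self.assertEqual(tweets, [])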
from weibo_scraper import get_weibo_tweets_by_name  # needed for the call below
from bs4 import BeautifulSoup

###
### PARAMETERS YOU SHOULD SET:
# Accounts you want to scrape
searchqueries = ['嘻红豆']

### DO NOT CHANGE ANYTHING BEYOND THIS LINE
twitterdata = []
for searchquery in searchqueries:
    print(searchquery)
    print('---')
    # pages=None fetches all available pages for the account
    for tweet in get_weibo_tweets_by_name(name=searchquery, pages=None):
        soup = BeautifulSoup(tweet['mblog']['text'], 'html.parser')
        text = soup.getText()
        post_id = tweet['itemid']  # renamed from 'id' to avoid shadowing the builtin
        created = tweet['mblog']['created_at']
        print(text, 'posted on', created)
        twitterdata.append({
            'id': post_id,
            'text': text,
            'created': created,
            'author': searchquery,
        })
print('Successfully harvested', len(twitterdata), 'posts')
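# The harvested posts above only live in memory; a minimal sketch for persisting
# them, assuming the loop above has run ('weibo_posts.json' is a placeholder path):
import json

with open('weibo_posts.json', 'w', encoding='utf-8') as f:
    json.dump(twitterdata, f, ensure_ascii=False, indent=2)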
def test_get_containerid_from_second_profile(self):
    result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='来去之间', pages=1)
    for i in result_iterator:
        print(i)
    self.assertIsNotNone(result_iterator)
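# Note that assertIsNotNone on a generator object always passes, even when it
# yields nothing. A stricter hypothetical variant, assuming the same API:
def test_second_profile_yields_at_least_one_tweet(self):
    tweets = list(weibo_scraper.get_weibo_tweets_by_name(name='来去之间', pages=1))
    self.assertGreater(len(tweets), 0)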
import csv
import datetime
import time

def get_tweets_weibo(name, output_file, pages=10):
    # Column names in the resulting csv file
    columns = ['Created at', 'Tweet', 'is_paid', 'num_reposts', 'num_comments', 'num_likes']
    # Write these column names to the file first (newline='' avoids blank rows on Windows)
    with open(output_file, 'a', newline='') as wf:
        writer = csv.writer(wf)
        writer.writerow(columns)

    # Parameters to keep track of progress
    num_processed = 0    # total number of tweets processed so far
    start = time.time()  # time at which scraping started
    all_tweets = []      # every tweet scraped; returned so the caller can tally totals such as likes

    print("Getting {}'s Weibo tweets...".format(name))
    # Get 'pages' pages of tweets of the user specified by name
    response = get_weibo_tweets_by_name(name, pages)

    # Extract the data corresponding to the fields specified in columns
    for tweet in response:
        # Only keep the tweet if it isn't a retweet (a 'retweeted_status' key
        # means someone else wrote it)
        if 'retweeted_status' not in tweet['mblog']:
            all_tweets.append([
                tweet['mblog']['created_at'],
                cleanhtml(tweet['mblog']['text']),
                tweet['mblog']['is_paid'],
                tweet['mblog']['reposts_count'],
                tweet['mblog']['comments_count'],
                tweet['mblog']['attitudes_count'],
            ])
            num_processed += 1
            # Once the batch reaches 100 tweets, flush it to the output file
            if num_processed % 100 == 0:
                print("Writing items {} to {} to {}...".format(
                    num_processed - 99, num_processed, output_file))
                with open(output_file, 'a', newline='') as wf:
                    writer = csv.writer(wf)
                    for item in all_tweets[-100:]:
                        writer.writerow(item)
                print("Done writing!")
                print("{} tweets processed: {}".format(num_processed, datetime.datetime.now()))

    # Write any leftover tweets that didn't fill a full batch of 100
    if num_processed % 100 > 0:
        print("Writing remaining {} items to {}...".format(num_processed % 100, output_file))
        with open(output_file, 'a', newline='') as wf:
            writer = csv.writer(wf)
            for tweet in all_tweets[-(num_processed % 100):]:
                writer.writerow(tweet)
        print("Done writing!")

    end = time.time()
    print("Successfully retrieved {} of {}'s Weibo tweets in {} seconds!\n".format(
        num_processed, name, end - start))
    return columns, all_tweets
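# The function above calls a cleanhtml helper that isn't shown; a minimal sketch
# of what it might look like (an assumed regex tag-stripper, not the original):
import re

def cleanhtml(raw_html):
    # Strip HTML tags from the mblog text; crude, but enough for plain display text.
    return re.sub(r'<[^>]+>', '', raw_html)

# Example invocation; the display name and output path are placeholders.
columns, tweets = get_tweets_weibo('嘻红豆', 'tweets.csv', pages=5)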
import sys
import os
import logging

import weibo_scraper
from weibo_scraper import set_debug
from weibo_base.weibo_component import exist_get_uid, get_tweet_containerid
from weibo_base.weibo_util import Timer

if __name__ == '__main__':
    set_debug()
    uid = exist_get_uid(name='嘻红豆')
    print(uid)
    containerid = get_tweet_containerid(uid=uid.get('uid'))
    print(containerid)

    result = weibo_scraper.get_weibo_tweets_by_name(name="嘻红豆", pages=1)
    for tweet in result:
        print(tweet)

    result = weibo_scraper.get_weibo_tweets(tweet_container_id=containerid, pages=1)
    for tweet in result:
        print(tweet)

    wp = weibo_scraper.get_weibo_profile(name='嘻红豆')
    print(wp.raw_user_response)

    hotwords = weibo_scraper.get_realtime_hotwords()
    for hw in hotwords:
        print(str(hw))

    wt = Timer(name="realtime_hotword_timer",