Example #1
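Fetch one page of tweets for an existing nickname and for a non-existent nickname, printing whatever the iterator yields.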
    def test_get_weibo_tweets_by_name(self):
        # Nickname that exists: the iterator yields tweet dicts
        result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='嘻红豆', pages=1)
        for i in result_iterator:
            print(i)
        # Nickname that does not exist
        result_iterator2 = weibo_scraper.get_weibo_tweets_by_name(name='nicknameisnotexist', pages=1)
        for i in result_iterator2:
            print(i)
Example #2
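A unit-test variant of the same call: tweets for an existing nickname are printed, and every item yielded for a non-existent nickname is asserted to be None.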
    def test_get_weibo_tweets_by_name(self):
        for i in weibo_scraper.get_weibo_tweets_by_name(name='Helixcs',
                                                        pages=1):
            print(i)

        for i in weibo_scraper.get_weibo_tweets_by_name(
                name='nicknameisnotexist', pages=1):
            self.assertIsNone(i)
Example #3
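Harvest all available pages for a list of accounts, strip the HTML from each tweet's text with BeautifulSoup, and collect the results in a list of dicts.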
from bs4 import BeautifulSoup
from weibo_scraper import get_weibo_tweets_by_name

###

### PARAMETERS YOU SHOULD SET:
# Accounts you want to scrape

searchqueries = ['嘻红豆']

### DO NOT CHANGE ANYTHING BEYOND THIS LINE
twitterdata = []

for searchquery in searchqueries:
    print(searchquery)
    print('---')
    for tweet in get_weibo_tweets_by_name(name=searchquery, pages=None):
        soup = BeautifulSoup(tweet['mblog']['text'], 'html.parser')
        text = soup.getText()
        tweet_id = tweet['itemid']
        created = tweet['mblog']['created_at']

        print(text, 'posted on', created)

        twitterdata.append({
            'id': tweet_id,
            'text': text,
            'created': created,
            'author': searchquery
        })

print('Successfully harvested', len(twitterdata), 'posts')
Example #4
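Fetch one page of tweets for another profile ('来去之间') and check that the returned iterator is not None.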
    def test_get_containerid_from_second_profile(self):
        result_iterator = weibo_scraper.get_weibo_tweets_by_name(name='来去之间', pages=1)
        for i in result_iterator:
            print(i)
        self.assertIsNotNone(result_iterator)
Example #5
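A fuller helper that collects a user's original (non-retweeted) tweets, writes them to a CSV file in batches of 100 rows, and returns the column names together with all collected rows.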
import csv
import datetime
import time

from weibo_scraper import get_weibo_tweets_by_name


def get_tweets_weibo(name, output_file, pages=10):
    # Column names in the resulting csv file
    columns = [
        'Created at', 'Tweet', 'is_paid', 'num_reposts', 'num_comments',
        'num_likes'
    ]
    # Write these column names to the file first
    with open(output_file, 'a') as wf:
        writer = csv.writer(wf)
        writer.writerow(columns)

    # Parameters to keep track of progress
    num_processed = 0  # total number of tweets processed thus far
    start = time.time()  # time at which scraping started
    # Contains every tweet scraped; returned so the caller can compute the user's total likes
    all_tweets = []

    print("Getting {}'s Weibo tweets...".format(name))

    # Get 'pages' number of pages of tweets of the user specified by name
    response = get_weibo_tweets_by_name(name, pages)

    # Extract the data corresponding to the fields specified in columns
    for tweet in response:
        # Only get the tweet if it isn't retweeted (if it's retweeted, it means someone else wrote the tweet)
        if 'retweeted_status' not in tweet['mblog'].keys():
            all_tweets.append([
                tweet['mblog']['created_at'],
                cleanhtml(tweet['mblog']['text']), tweet['mblog']['is_paid'],
                tweet['mblog']['reposts_count'],
                tweet['mblog']['comments_count'],
                tweet['mblog']['attitudes_count']
            ])

            num_processed += 1

            # Once the batch size reaches 100, write it to the output file
            # (checked inside the branch so skipped retweets cannot re-trigger the same write)
            if num_processed % 100 == 0:
                print("Writing items {} to {} to {}...".format(
                    num_processed - 99, num_processed, output_file))
                with open(output_file, 'a') as wf:
                    writer = csv.writer(wf)
                    for item in all_tweets[-100:]:
                        writer.writerow(item)
                print("Done writing!")
                print("{} tweets processed: {}".format(num_processed,
                                                       datetime.datetime.now()))

    # Ensure that any remaining tweets are also written to the output file
    if num_processed % 100 > 0:
        print("Writing remaining {} items to {}...".format(
            num_processed % 100, output_file))
        with open(output_file, 'a') as wf:
            writer = csv.writer(wf)
            for tweet in all_tweets[-(num_processed % 100):]:
                writer.writerow(tweet)
        print("Done writing!")

    end = time.time()
    print("Successfully retrieved {} of {}'s Weibo tweets in {} seconds!\n".
          format(num_processed, name, end - start))

    return columns, all_tweets
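The helper above calls cleanhtml(), which is not defined in the snippet. A minimal sketch, assuming the helper only needs to strip HTML tags from the tweet text, could look like this:

import re

def cleanhtml(raw_html):
    # Remove anything that looks like an HTML tag, leaving only the plain text
    return re.sub(r'<[^>]+>', '', raw_html)

BeautifulSoup's getText(), as used in Example #3, is an equivalent alternative.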
Example #6
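End-to-end walkthrough: enable debug logging, resolve a nickname to a uid and tweet container id, fetch tweets both by name and by container id, load a user profile, and iterate the real-time hot words.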
import sys
import os
import weibo_scraper
from weibo_scraper import set_debug
from weibo_base.weibo_component import exist_get_uid, get_tweet_containerid
from weibo_base.weibo_util import Timer
import logging

if __name__ == '__main__':
    set_debug()
    uid = exist_get_uid(name='嘻红豆')
    print(uid)
    containerid = get_tweet_containerid(uid=uid.get('uid'))
    print(containerid)

    result = weibo_scraper.get_weibo_tweets_by_name(name="嘻红豆", pages=1)
    for tweet in result:
        print(tweet)
    result = weibo_scraper.get_weibo_tweets(tweet_container_id=containerid,
                                            pages=1)
    for tweet in result:
        print(tweet)

    wp = weibo_scraper.get_weibo_profile(name='嘻红豆')
    print(wp.raw_user_response)

    hotwords = weibo_scraper.get_realtime_hotwords()
    for hw in hotwords:
        print(str(hw))
    pass
    wt = Timer(name="realtime_hotword_timer",