def get_followers(name):

    # open the spreadsheet once and add the column heads
    # (reopening the file for every follower would overwrite the rows already written)
    with open_csv_w('%s_followerlist.csv' % name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "screen_name", "display_name", "bio", "followers_count",
            "following_count", "acct_created", "location"
        ])

        # followers_ids returns an array of the ids of everyone who follows the user
        follower_ids = api.followers_ids(screen_name=name)

        # cycle through every id in the array of followers and gather information for each one
        for follower_id in follower_ids:
            user = None
            while user is None:
                try:
                    user = api.get_user(follower_id)
                except tweepy.error.RateLimitError:
                    print("sleeping for a minute")
                    time.sleep(60)

            # write one row per follower
            writer.writerow([
                follower_id,
                user.screen_name.encode('utf-8'),
                user.name.encode('utf-8'),
                user.description.encode('utf-8'), user.followers_count,
                user.friends_count, user.created_at,
                user.location.encode('utf-8')
            ])
            print(user.screen_name.encode('utf-8'))
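
# NOTE: every excerpt in this collection writes through an open_csv_w helper
# imported from a local utils module (see the import block in Example 9); its
# definition is not included in these excerpts. A minimal sketch of what such a
# helper might look like, assuming it simply opens the path for writing with
# csv-friendly settings, is:
def open_csv_w(path):
    # newline='' keeps the csv module from adding blank rows on Windows
    return open(path, 'w', newline='', encoding='utf-8')
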
def proof_facebook_ids(fb_id):

    # constructing the url for an http request for the Facebook API
    base = 'https://graph.facebook.com/v2.9'
    page_id = "/%s/" % fb_id
    extra_parameters = '?access_token=%s' % access_token
    url = base + page_id + extra_parameters

    # variables for spreadsheet
    validity = ""
    processed_id = fb_id
    error_message = ""

    # retrieve data
    resp = custom_request(url)
    if resp:
        validity = True
        data = json.loads(resp)  # reuse the response instead of requesting again
        error_message = None
    else:
        validity = False
        r = requests.get(url)
        response = json.loads(r.text)
        error_message = response["error"]["message"]

    # prep data for csv
    id_data = [processed_id, validity, error_message]

    #write the csv
    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(id_data)

# Example 3

def get_all_tweets(screen_name):
    #Twitter only allows access to a user's most recent 3240 tweets with this method

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
    auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
    api = tweepy.API(auth)

    #initialize a list to hold all the tweepy Tweets
    alltweets = []

    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    #save most recent tweets
    alltweets.extend(new_tweets)

    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        #save most recent tweets
        alltweets.extend(new_tweets)

        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(alltweets)))

    #transform the tweepy tweets into a 2D array that will populate the csv (you can comment out fields you don't need)
    outtweets = [[
        tweet.id_str,
        tweet.created_at,
        tweet.favorite_count,
        tweet.retweet_count,
        tweet.retweeted,
        tweet.source.encode("utf-8"),
        tweet.text.encode("utf-8"),
    ] for tweet in alltweets]

    #write the csv
    with open_csv_w('../output/%s_tweets.csv' % screen_name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "created_at", "favorites", "retweets", "retweeted", "source",
            "text"
        ])
        writer.writerows(outtweets)

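
# A hypothetical way to run the function above; the screen name here is only a
# placeholder for illustration, not part of the original script.
if __name__ == '__main__':
    get_all_tweets("some_screen_name")
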
def get_userinfo(name):
    #set user to be the screen_name
    user = api.get_user(screen_name=name)

    # create row
    userinfo = [
        user.id, user.screen_name, user.name, user.description,
        user.followers_count, user.friends_count, user.favourites_count,
        user.statuses_count, user.created_at, user.lang, user.location,
        user.protected, user.verified
    ]
    print(userinfo)

    # write the csv
    with open_csv_w('../output/userinfo.csv') as f:
        writer = csv.writer(f)
        writer.writerow(userinfo)

# Example 5

def scrapeFacebookPageFeedStatus(group_id, access_token):
    with open_csv_w('../output/%s_facebook_statuses.csv' % group_id) as file:
        w = csv.writer(file)
        w.writerow([
            "status_id", "status_message", "status_author", "link_name",
            "status_type", "status_link", "status_published", "num_reactions",
            "num_comments", "num_shares", "num_likes", "num_loves", "num_wows",
            "num_hahas", "num_sads", "num_angrys"
        ])

        has_next_page = True
        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Facebook Page: %s\n" % \
                (group_id, scrape_starttime))

        statuses = getFacebookPageFeedData(group_id, access_token, 100)

        while has_next_page:
            for status in statuses['data']:

                # Ensure it is a status with the expected metadata
                if 'reactions' in status:
                    w.writerow(processFacebookPageFeedStatus(status, \
                                                            access_token))

                # output progress occasionally to make sure code is not
                # stalling
                num_processed += 1
                if num_processed % 100 == 0:
                    print("%s Statuses Processed: %s" %
                          (num_processed, datetime.datetime.now()))

            # if there is no next page, we're done.
            if 'paging' in statuses and 'next' in statuses['paging']:
                statuses = json.loads(request_until_succeed(
                    statuses['paging']['next']))
            else:
                has_next_page = False


        print("\nDone!\n%s Statuses Processed in %s" %
              (num_processed, datetime.datetime.now() - scrape_starttime))

# Example 6

def get_userinfo(name):
    #set user to be the screen_name
    user = api.get_user(screen_name=name)

    # create row
    userinfo = [
        name.encode('utf-8'),
        user.name.encode('utf-8'),
        user.description.encode('utf-8'), user.followers_count,
        user.friends_count, user.created_at,
        user.location.encode('utf-8')
    ]
    print(userinfo)

    # write the csv
    with open_csv_w('userinfo.csv') as f:
        writer = csv.writer(f)
        writer.writerow(userinfo)

# Example 7

def limit_handled(cursor):
    # wrap a tweepy cursor so the script waits out rate limits instead of crashing
    while True:
        try:
            yield cursor.next()
        except tweepy.error.TweepError:
            print("waiting 15 minutes for Twitter to let me get more tweets")
            time.sleep(15 * 60)


# counter for console messages
counter = 0

# search terms
# find a full list of conventions here: https://dev.twitter.com/rest/public/search#query-operators
searchterm = "#MuellerReport"

# Open/Create a file to append data
csvFile = open_csv_w('../output/%s-result.csv' % searchterm)
#Use csv Writer
csvWriter = csv.writer(csvFile)
# these are the headers of your csv
csvWriter.writerow(
    ["id", "authorname", "created_at", "favorites", "retweets", "text"])

# loop to put tweets into the csv
for tweet in limit_handled(
        tweepy.Cursor(
            api.search,
            q=searchterm,
            # note that Twitter only makes available a sample of tweets from the last 7 days: https://dev.twitter.com/rest/public/search
            # point of time you want the search to start
            since="2019-01-10",
            # point of time you want the search to end

# Example 8

                    'channelId': channelId,
                    'channelTitle': channelTitle
                }
                rows.append(video_data_row)
            # csv_writer.writerow(video_data_row)
        else:
            print('no more posts!')
            has_another_page = False


if __name__ == '__main__':
    for date_string in date_strings:
        gatherAPIdata(date_string['start_date'], date_string['end_date'])

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-video-search-results.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'publishedAt', 'title', 'description', 'kind', 'videoID',
            'channelId', 'channelTitle'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()

        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)

# Example 9

import tweepy
from utils import open_csv_w

# import authentication credentials
from secrets import TWITTER_C_KEY, TWITTER_C_SECRET, TWITTER_A_KEY, TWITTER_A_SECRET

#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
api = tweepy.API(auth)

#Returns the 20 most recent statuses posted by the authenticating user and the users they follow
home_timeline = api.home_timeline()
with open_csv_w('home_timeline.txt') as f:
    for tweet in home_timeline:
        print(tweet.text, file=f)

#Returns the 20 most recent mentions, including retweets.
mentions_timeline = api.mentions_timeline()
with open_csv_w('mentions_timeline.txt') as f:
    for tweet in mentions_timeline:
        print(tweet.text, file=f)

#returns 20 most recent posts of authenticating user
my_timeline = api.user_timeline()
with open_csv_w('my_timeline.txt') as f:
    for tweet in my_timeline:
        print(tweet.text, file=f)

#returns the 20 most recent posts of the specified user_id
user_timeline = api.user_timeline(user_id=1153916176453513216)
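
# The excerpt ends here; following the same pattern as the timelines above, this
# result could be written out as well (the filename is just an illustration):
with open_csv_w('user_timeline.txt') as f:
    for tweet in user_timeline:
        print(tweet.text, file=f)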

# Example 10

    if resp:
        validity = True
        data = json.loads(resp)  # reuse the response instead of requesting again
        error_message = None
    else:
        validity = False
        r = requests.get(url)
        response = json.loads(r.text)
        error_message = response["error"]["message"]

    # prep data for csv
    id_data = [processed_id, validity, error_message]

    #write the csv
    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(id_data)


# run the proofer
if __name__ == '__main__':
    # set array of IDs you want to proof
    fb_ids = []

    with open_csv_w('../output/cleaned_ids.csv') as f:
        writer = csv.writer(f)
        writer.writerow(["id", "valid", "error"])
    # iterate through all the IDs and proof each one
    for fb_id in fb_ids:
        proof_facebook_ids(fb_id)

# Example 11

                'favoriteCount': favoriteCount,
                'commentCount': commentCount,
                'topicCategories': topicCategories
            }

        rows.append(row)
    else:
        print(video_id + " is not a valid ID")


if __name__ == '__main__':
    for video_id in video_ids:
        get_video_data(video_id)

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-video-information.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'youtube_id', 'publishedAt', 'channelId', 'channelTitle', 'title',
            'description', 'tags', 'viewCount', 'likeCount', 'dislikeCount',
            'favoriteCount', 'commentCount', 'topicCategories'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()

        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)

# Example 12

                'viewCount': viewCount,
                'subscriberCount': subscriberCount,
                'videoCount': videoCount,
                'commentCount': commentCount
            }
        rows.append(row)
    else:
        print(video_id + " is not a valid ID")


if __name__ == '__main__':
    for channel_id in channel_ids:
        get_channel_data(channel_id)

    # make a new csv into which we will write all the rows
    with open_csv_w('../output/youtube-channel-information.csv') as csvfile:
        # these are the header names:
        fieldnames = [
            'youtube_id', 'publishedAt', 'title', 'description', 'viewCount',
            'subscriberCount', 'videoCount', 'commentCount'
        ]
        # this creates your csv
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        # this writes in the first row, which are the headers
        writer.writeheader()

        # this loops through your rows (the array you set at the beginning and have updated throughout)
        for row in rows:
            # this takes each row and writes it into your csv
            writer.writerow(row)

# Example 13

def limit_handled(cursor):
    # wrap a tweepy cursor so the script waits out rate limits instead of crashing
    while True:
        try:
            yield cursor.next()
        except tweepy.error.TweepError:
            print("waiting 15 minutes for Twitter to let me get more tweets")
            time.sleep(15 * 60)


# counter for console messages
counter = 0

# search terms
# find a full list of conventions here: https://dev.twitter.com/rest/public/search#query-operators
searchterm = "\"Ben Smith\""

# Open/Create a file to append data
csvFile = open_csv_w('%s-result.csv' % searchterm)
#Use csv Writer
csvWriter = csv.writer(csvFile)
# these are the headers of your csv
csvWriter.writerow(
    ["id", "authorname", "created_at", "favorites", "retweets", "text"])

# loop to put tweets into the csv
for tweet in limit_handled(
        tweepy.Cursor(
            api.search,
            q=searchterm,
            # note that Twitter only makes available a sample of tweets from the last 7 days: https://dev.twitter.com/rest/public/search
            # point of time you want the search to start
            since="2017-01-10",
            # point of time you want the search to end

# Example 14

def scrapeFacebookPageFeedComments(page_id, access_token):
    # with open('%s_facebook_comments.csv' % file_id, 'wb') as file:
    with open_csv_w('../output/%s_facebook_comments.csv' % page_id) as file:
        w = csv.writer(file)
        w.writerow([
            "comment_id", "status_id", "parent_id", "comment_message",
            "comment_author", "comment_published", "comment_likes"
        ])

        num_processed = 0  # keep a count on how many we've processed
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Comments From Posts: %s\n" %
              (page_id, scrape_starttime))

        # with open('%s_facebook_statuses.csv' % file_id, 'rb') as csvfile:
        # read the statuses csv with a plain text-mode open (the write helper would truncate it)
        with open('../output/%s_facebook_statuses.csv' % page_id, 'r',
                  newline='') as csvfile:
            reader = csv.DictReader(csvfile)

            #reader = [dict(status_id='759985267390294_1158001970921953')]

            for status in reader:
                has_next_page = True

                comments = getFacebookCommentFeedData(status['status_id'],
                                                      access_token, 100)

                while has_next_page and comments is not None:
                    for comment in comments['data']:
                        w.writerow(
                            processFacebookComment(comment,
                                                   status['status_id']))

                        if 'comments' in comment:
                            has_next_subpage = True

                            subcomments = getFacebookCommentFeedData(
                                comment['id'], access_token, 100)

                            while has_next_subpage:
                                for subcomment in subcomments['data']:
                                    # print (processFacebookComment(
                                    # subcomment, status['status_id'],
                                    # comment['id']))
                                    w.writerow(
                                        processFacebookComment(
                                            subcomment, status['status_id'],
                                            comment['id']))

                                    num_processed += 1
                                    if num_processed % 1000 == 0:
                                        print("%s Comments Processed: %s" % \
                                                (num_processed,
                                                    datetime.datetime.now()))

                                if 'paging' in subcomments:
                                    if 'next' in subcomments['paging']:
                                        subcomments = json.loads(
                                                request_until_succeed(
                                                    subcomments['paging']\
                                                               ['next']))
                                    else:
                                        has_next_subpage = False
                                else:
                                    has_next_subpage = False

                        # output progress occasionally to make sure code is not
                        # stalling
                        num_processed += 1
                        if num_processed % 1000 == 0:
                            print("%s Comments Processed: %s" % \
                                    (num_processed, datetime.datetime.now()))

                    if 'paging' in comments:
                        if 'next' in comments['paging']:
                            comments = json.loads(
                                request_until_succeed(
                                    comments['paging']['next']))
                        else:
                            has_next_page = False
                    else:
                        has_next_page = False


        print("\nDone!\n%s Comments Processed in %s" % \
                (num_processed, datetime.datetime.now() - scrape_starttime))
def get_all_tweets(screen_name):
    #Twitter only allows access to a user's most recent 3240 tweets with this method

    #authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(TWITTER_C_KEY, TWITTER_C_SECRET)
    auth.set_access_token(TWITTER_A_KEY, TWITTER_A_SECRET)
    api = tweepy.API(auth)

    #initialize a list to hold all the tweepy Tweets
    alltweets = []

    #make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    #save most recent tweets
    alltweets.extend(new_tweets)

    #save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    #keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print("getting tweets before %s" % (oldest))

        #all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name,
                                       count=200,
                                       max_id=oldest)

        #save most recent tweets
        alltweets.extend(new_tweets)

        #update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print("...%s tweets downloaded so far" % (len(alltweets)))

    #transform the tweepy tweets into a 2D array that will populate the csv (you can comment out fields you don't need)
    outtweets = [[
        tweet.id_str, tweet.created_at, tweet.favorite_count,
        tweet.retweet_count, tweet.retweeted, tweet.source, tweet.text,
        tweet.geo, tweet.lang, tweet.is_quote_status, tweet.user.name,
        tweet.user.screen_name, tweet.user.location, tweet.user.description,
        tweet.user.protected, tweet.user.followers_count,
        tweet.user.friends_count, tweet.user.listed_count,
        tweet.user.created_at, tweet.user.favourites_count,
        tweet.user.utc_offset, tweet.user.time_zone, tweet.user.geo_enabled,
        tweet.user.verified, tweet.user.statuses_count, tweet.user.lang
    ] for tweet in alltweets]

    #write the csv
    with open_csv_w('%s_tweets.csv' % screen_name) as f:
        writer = csv.writer(f)
        writer.writerow([
            "id", "created_at", "favorites", "retweets", "retweeted", "source",
            "text", "geolocation", "language", "is_quote_status", "username",
            "user_screen_name", "user_location", "user_description",
            "user_protected", "user_followers_count", "user_friends_count",
            "user_listed_count", "user_created_at", "user_favourites_count",
            "user_utc_offset", "user_time_zone", "user_geo_enabled",
            "user_verified", "user_statuses_count", "user_lang"
        ])
        writer.writerows(outtweets)
