Example #1
# assumed imports (the snippet comes from a larger module); `project_root`
# and the `Tweet` MongoEngine model are defined elsewhere in the project
import json
import os
import time

import tweepy
import urllib3
from mongoengine import connect

def getTweets():
    time_per_100       = []
    fetch_count        = 0
    avg_time_per_fetch = 0
    tweet_count        = 0
    modified_at        = None


    # disable all HTTPS/TLS warnings
    urllib3.disable_warnings()

    # load api keys from api_keys.json
    keys_file_path = os.path.join(project_root, 'Key', 'api_keys.json')

    with open(keys_file_path) as api_keys:    
        keys = json.load(api_keys)

    # obtain multiple instances of the Twitter API to spread requests
    # across keys and soften the per-app rate limit
    authList = []
    apiList  = []
    for key in keys:
        auth = tweepy.OAuthHandler(key['consumer_key'], key['consumer_secret'])
        auth.set_access_token(key['access_token'], key['access_token_secret'])
        authList.append(auth)
        apiList.append(tweepy.API(auth))

    # db_path = os.path.join(os.path.dirname(__file__), os.pardir, 'Data/tweet_ids') 

    # connect to DB
    db = connect('Shit_db')

    # drop the database so that each run of the thread starts with a fresh copy
    db.drop_database('Shit_db')
    t0 = time.time()
    no_of_tweets = 0
    total_no_of_tweets = 0

    # news channel IDs
    # user_id = [34908698,362051343,361501426,7905122,180306960,30857481,28370738,110458336,2883841,28172926,30261067,20562637,113050195,28140646,621523,35773039,15164565,15861355,44316192,44078873,15861220,1642135962,28137012,38400130,32355144,122097236,19230601,713993413,7302282,16877611,2557521,26257166,15110357,4898091,34713362,18949452,32359921,16334857,59736898,214007688,129834917,15108702,39817941,375721095,2424208849,506504366,242689065,116559622,23484039,18424289,64643056,115754870,134758540,6509832,267158021,29958928,15954704,19897138,37034483,36327407,20751449,3123883238,240649814,31632905,177829660,256495314,39743812,245687754,38647512,355989081,98362607,17710740,39240673,17469289,16973333,87818409,18071358,9763482,87416722,4970411,7587032,788524,14173315,612473,28785486,2467791,15012486,5988062,1367531,759251,428333,6017542,3108351,51241574,1652541,14293310,807095,742143,5402612]
    
    # non news channel IDs
    user_id = [79708561,281766200,785493949,250205792,180463340,3060210854,2305049443,273181052,2463499796,71876190,26642006,92367751,259379883,399428964,26565946,24494557,166739404,52551600,25365536,15485441,15846407,14234323,125481462,27042513,133880286,243284052,44588485,51376979,27260086,17919972,18625669,16409683,21447363,58135085,23375688,92724677,30973,50374439,48410093,57928790,87170183,102957248,108391251,120998613,115622213,113419517,6463042,94775494,131975194,97865628,79915337,332188446,41067945,197150180,78022296,31348594,902534288,108253263,63390627,145125358,78242874,468479147,36057824,34464376,111871312,152251488,121677709,38403110,21787625,494747331,94163409,44849431,18872373,105710210,148248527,38479920,508932270,183230911,186388502,101311381,70652594,2719753171,23976386,23002923,33868638,16548023,40453512,18681139,279449435,144755081,132385468,54829997,266714730,108252113,3138637447,1111706414,61755650,14120922,216447259,129786468]
    print "No of users={0}".format(len(user_id))
    
    last_id        = [None for i in range(len(user_id))]
    number         = 0
    rate_limit     = 180
    no_of_requests = 0
    tweet_list     = []
    k              = 0 # current_api_index
    api_wait_end_time  = time.time() # timestamp until which the first API may still be rate-limited

    while total_no_of_tweets < 3200 * len(user_id):
        try:
            status_obj = apiList[k].user_timeline(user_id=user_id[number],
                                                  count=200,
                                                  max_id=last_id[number])
            # print("fetched {0} tweets".format(len(status_obj)))
            no_of_requests += 1
            for status in status_obj:
                tweet                       = Tweet()
                tweet.tweet_id              = status.id_str
                tweet.text                  = status.text
                tweet.created_at            = status.created_at
                tweet.in_reply_to_status_id = status.in_reply_to_status_id_str 
                tweet.user_id               = status.user.id_str
                tweet.user_name             = status.user.name
                tweet.user_followers        = status.user.followers_count
                tweet.user_location         = status.user.location
                tweet.favourites_count      = status.user.favourites_count
                if status.coordinates is not None:
                    tweet.coordinates       = status.coordinates['coordinates']
                tweet.language              = status.lang
                # tweet.place_coordinates   = status['']
                tweet.retweet_count         = status.retweet_count
                tweet.retweeted             = status.retweeted
                # tweet.inserted_at
                tweet.is_news               = True
                # tweet.save()
                tweet_list.append(tweet)
                no_of_tweets += 1
                total_no_of_tweets += 1
                # user_timeline's max_id is inclusive, so step one id below the
                # oldest tweet seen to avoid re-fetching it on the next page
                last_id[number] = status.id - 1
                # print(tweet.user_name)
            # print "last id={0}".format(last_id[number])
            # print "total no of tweets {0}".format(no_of_tweets)
            if no_of_requests % 100 == 0:
                print("{0} tweets fetched".format(total_no_of_tweets))

            # an empty page means the user's timeline is exhausted, so move on
            # even if fewer than 3200 tweets were collected
            if no_of_tweets >= 3200 or len(status_obj) == 0:
                print("Saving Tweets to DB")
                # save tweets to db
                if tweet_list:
                    Tweet.objects.insert(tweet_list)
                tweet_list = []
                number += 1

                # if we have fetched tweets for every user, just return
                # ('>=' rather than '>', so the index can never wrap around)
                if number >= len(user_id):
                    return
                print("moved to user {0}".format(number))
                no_of_tweets = 0

        except tweepy.RateLimitError:
            print("Saving Tweets to DB")
            # save tweets to db
            if tweet_list:
                Tweet.objects.insert(tweet_list)
            tweet_list = []
            if k == len(apiList) - 1:
                if api_wait_end_time > time.time():
                    # the first API's window has not reset yet; wait it out
                    sleep_time = api_wait_end_time - time.time()
                    print("create_db: sleeping for {0} seconds".format(sleep_time))
                    time.sleep(sleep_time)
            k = (k + 1) % len(apiList)
            if k == 0:
                # update api_wait_end_time for the next full rotation
                api_wait_end_time = time.time() + 15 * 60

            # print("Going to Sleep")
            # print no_of_requests
            # t0 = time.time() - t0
            # if t0 > 16*60:
            #     print "sleeping for {0} sec".format(15*60) 
            #     time.sleep(15*60) 
            # else:
            #     print "sleeping for {0} sec".format(16*60 - t0)
            #     time.sleep(16*60 - t0)
            # t0 = time.time()

        except Exception as e:
            print("Unexpected exception: {0}".format(e))
            time.sleep(15 * 60)

    elapsed_minutes = (time.time() - t0) / 60
    print("Total time: {0} minutes".format(elapsed_minutes))
    return
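
Example #1 indexes keys[i]['consumer_key'], so after json.load the keys value is assumed to be a list with one credential dict per Twitter app (which is what lets it rotate across several API instances). A sketch of the assumed structure, with placeholder values and one dict per additional app:

keys = [
    {
        "consumer_key": "PLACEHOLDER",
        "consumer_secret": "PLACEHOLDER",
        "access_token": "PLACEHOLDER",
        "access_token_secret": "PLACEHOLDER",
    },
    # one dict per additional Twitter app
]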
Example #2
# assumed imports, as in Example #1; `Tweet` is the project's MongoEngine
# model and `r` a module-level Redis client defined elsewhere in the project
import datetime
import json
import os
import time

import redis
import tweepy
import urllib3
from mongoengine import connect

r = redis.StrictRedis()  # assumed client; defaults to localhost:6379

def scrapeTweets():
    time_per_100       = []
    fetch_count        = 0
    avg_time_per_fetch = 0
    tweet_count        = 0
    target             = 1000000
    modified_at        = None


    # disable all HTTPS/TLS warnings
    urllib3.disable_warnings()

    # load api keys from api_keys.json
    keys_file_path = os.path.join(os.path.dirname(__file__), os.pardir, 'api_keys.json')

    with open(keys_file_path) as api_keys:    
        keys = json.load(api_keys)

    # provide auth params & obtain an instance of API
    auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
    auth.set_access_token(keys['access_token'], keys['access_token_secret'])

    api = tweepy.API(auth)

    db_path = os.path.join(os.path.dirname(__file__), os.pardir, 'Data/tweet_ids')

    # connect to DB
    db = connect('Tweets')

    # drop the database so that each run of the thread starts with a fresh copy
    db.drop_database('Tweets')

    tweet_id_list = []

    with open(db_path) as file_db:
        t0 = time.time()
        for line in file_db:
            status_obj = None
            tweet_id = line.split("\t")[0]
            tweet_id_list.append(tweet_id)
            if len(tweet_id_list) == 100:
                try:
                    # statuses/lookup accepts at most 100 ids per call; plain
                    # booleans replace the original's list-valued arguments,
                    # and map_=False silently drops deleted/protected ids,
                    # which the loop below assumes
                    status_obj = api.statuses_lookup(tweet_id_list,
                                                     include_entities=False,
                                                     trim_user=False,
                                                     map_=False)
                    for status in status_obj:
                        tweet                       = Tweet()
                        tweet.tweet_id              = status.id_str
                        tweet.text                  = status.text
                        tweet.created_at            = status.created_at
                        tweet.in_reply_to_status_id = status.in_reply_to_status_id_str 
                        tweet.user_id               = status.user.id_str
                        tweet.user_name             = status.user.name
                        tweet.user_followers        = status.user.followers_count
                        tweet.user_location         = status.user.location
                        tweet.favourites_count      = status.user.favourites_count
                        if status.coordinates is not None:
                            tweet.coordinates       = status.coordinates['coordinates']
                        tweet.language              = status.lang
                        # tweet.place_coordinates   = status['']
                        tweet.retweet_count         = status.retweet_count
                        tweet.retweeted             = status.retweeted
                        # tweet.inserted_at
                        tweet.is_news               = None
                        tweet.save()
                    t1 = time.time()
                    time_per_100.append(t1-t0)
                    fetch_count = fetch_count + 1
                    avg_time_per_fetch = sum(time_per_100)/len(time_per_100)
                    tweet_count += len(status_obj)
                    modified_at = datetime.datetime.now().strftime('%H:%M:%S %d-%m-%Y')
                    print("Scraped {0} tweets, Total ={1} tweets".format(
                        len(status_obj), tweet_count))

                    # save all the stats to REDIS
                    r.set('tweet_count', tweet_count)
                    r.set('avg_time_per_fetch', avg_time_per_fetch)
                    r.set('fetch_count', fetch_count)
                    r.set('modified_at', modified_at)
                    # r.set('target', target) 

                except tweepy.RateLimitError:
                    print("Going to Sleep")
                    time.sleep(15 * 60)
                except Exception as e:
                    print(str(e))
                    time.sleep(15 * 60)
                finally:
                    # assumed body (the source is truncated at this `finally`):
                    # start a fresh batch and timing window for the next 100 ids
                    tweet_id_list = []
                    t0 = time.time()
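
Here, by contrast, the code reads keys['consumer_key'] directly, so Example #2's api_keys.json is assumed to parse to a single credential dict rather than Example #1's list. The Tweet model itself appears in no snippet; a minimal MongoEngine sketch, with field names and types inferred purely from the assignments in the examples (the project's real model may differ):

from mongoengine import (BooleanField, DateTimeField, Document, FloatField,
                         IntField, ListField, StringField)

class Tweet(Document):
    # inferred from the assignments in the examples; types are best guesses
    tweet_id              = StringField(primary_key=True)
    text                  = StringField()
    created_at            = DateTimeField()
    in_reply_to_status_id = StringField()
    user_id               = StringField()
    user_name             = StringField()
    user_followers        = IntField()
    user_location         = StringField()
    favourites_count      = IntField()
    coordinates           = ListField(FloatField())  # [longitude, latitude]
    language              = StringField()
    retweet_count         = IntField()
    retweeted             = BooleanField()
    is_news               = BooleanField()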
Example #3
# assumed imports, as in Example #1; `Tweet` is the project's MongoEngine model
import json
import os
import time

import tweepy
import urllib3
from mongoengine import connect

total_tweets_retrieved = 0  # assumed module-level counter

def scrapeTweets():
    print('entered scrapeTweets')
    # disable all HTTPS/TLS warnings
    urllib3.disable_warnings()

    global total_tweets_retrieved

    # load api keys from api_keys.json
    keys_file_path = os.path.join(os.path.dirname(__file__), os.pardir, 'api_keys.json')

    with open(keys_file_path) as api_keys:    
        keys = json.load(api_keys)

    # provide auth params & obtain an instance of API
    auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
    auth.set_access_token(keys['access_token'], keys['access_token_secret'])

    api = tweepy.API(auth)

    relevant_tweet_ids_path = os.path.join(os.path.dirname(__file__), os.pardir, 'Data/relevance_judgments')

    # connect to DB
    db = connect('Tweets')
    with open(relevant_tweet_ids_path) as relevant_tweet_ids:
        print('opened relevance_judgments')
        t0 = time.time()
        for line in relevant_tweet_ids:
            status_obj = None
            tweet_id = line.split(" ")[0]
            tweets = Tweet.objects(tweet_id=tweet_id).all()
            # fetch the tweet only if it doesn't already exist in the DB
            if tweets.count() == 0:
                # if it doesn't exist, fetch from Twitter
                try:
                    print "Fetching tweet={0}".format(tweet_id)
                    status = api.get_status(tweet_id)
                    print status.id_str
                    total_tweets_retrieved      = total_tweets_retrieved + 1
                    tweet                       = Tweet()
                    tweet.tweet_id              = status.id_str
                    tweet.text                  = status.text
                    tweet.created_at            = status.created_at
                    tweet.in_reply_to_status_id = status.in_reply_to_status_id_str 
                    tweet.user_id               = status.user.id_str
                    tweet.user_name             = status.user.name
                    tweet.user_followers        = status.user.followers_count
                    tweet.user_location         = status.user.location
                    tweet.favourites_count      = status.user.favourites_count
                    if status.coordinates is not None:
                        tweet.coordinates       = status.coordinates['coordinates']
                    tweet.language              = status.lang
                    # tweet.place_coordinates   = status['']
                    tweet.retweet_count         = status.retweet_count
                    tweet.retweeted             = status.retweeted
                    # tweet.inserted_at
                    tweet.is_news               = True
                    tweet.save()
                except tweepy.RateLimitError:
                    print("Going to Sleep")
                    time.sleep(15 * 60)
                except Exception as e:
                    print(str(e))
            else:
                # if it does exist, then just label it as news
                for t in tweets:
                    t.is_news = True
                    t.save()
    print "Total Time={0}".format(time.time()-t0)
    print "Total Tweets={0}".format(total_tweets_retrieved)