Example #1
def query_user_meta(user_id, api_pool, context):
    '''
    Queries Twitter for user metadata for the ids in user_id.
    '''
    creds = api_pool.get_current_api_creds()
    function = 'users/lookup'
    the_url = "https://api.twitter.com/1.1/{}.json".format(function)

    # filter out user ids that have already been downloaded
    user_id = [u_id for u_id in user_id if user_not_downloaded(u_id, context)]

    # chunk the user_ids into lists of 100 per API call.
    id_count = 0
    for i, user_chunks in enumerate(chunker(user_id, 100)):
        api_pool.set_increment()
        if api_pool.get_current_api_calls() % 300 == 0:
            api_pool.find_next_token()
            creds = api_pool.get_current_api_creds()
        
        parameters = [('user_id', ','.join(user_chunks))]
        try:
            out = twitterreq(the_url, creds=creds, parameters=parameters)
            resp_code = out.code
        except SocketError as e: #connection reset by peer
            log(e)
            resp_code = 104
        
        if resp_code == 200:
            try:
                response = json.loads(out.read().decode('utf-8'))
                for row in response:
                    process_row(row, context)
                id_count += len(response)
                log("Iternation: {} Total_IDs: {}".format(i, id_count))
                time.sleep(1)
            
            except SocketError as e: # 104 sometimes shows up when we read the response.
                log("Likely a 104 error! {}".format(e))
                time.sleep(60 * 60)
        
        elif resp_code in [404, 400, 410, 422, 401]: # error with the data; log it. 401 means a private user!
            log("Iteration: {} fruitless with error {}".format(i, resp_code))

        elif resp_code in [420, 429, 406]: # rate limited, try again
            log("Iternation: {} rate limited with error {}".format(i, resp_code))
            time.sleep(901)
            api_pool.find_next_token()
            creds = api_pool.get_current_api_creds()

        elif resp_code in [500, 502, 503, 504, 104]: # server error, wait, try again.
            log("Iternation: {} server error {}".format(i, resp_code))
            time.sleep(60 * 60)

        else: # some other error, just break...
            log("Iternation: {} unknown error {}".format(i, resp_code))
            break

        # send an update to s3 after each iteration!
        s3.disk_2_s3(context['log'], context['s3_log'])
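
Example #1 leans on a chunker helper that is not shown anywhere in these snippets. A minimal sketch of what such a helper might look like, assuming it simply yields successive slices of a list:

def chunker(seq, size):
    # Yield successive slices of seq holding at most size elements each;
    # users/lookup accepts up to 100 user ids per request.
    for start in range(0, len(seq), size):
        yield seq[start:start + size]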
Example #2
def twitter_query(context):
    '''
    Gets user ids and feeds them into a function that queries Twitter.
    '''
    input_file = context['input']
    auth_file = context['auth']
    id_list = get_id_list(input_file)
    offset = context['start_idx_input']
    start_idx = context['start_idx_api']

    log('Creating oauth pool...')
    api_pool = TweepyPool(auth_file)
    for i, user_id in enumerate(id_list[offset:]):
        filename, s3_filename = get_user_id_file(user_id, context)
        if not s3.file_exists(s3_filename):
            log('writing user id: {} to {}'.format(user_id, filename))

            with open(filename, 'w+') as write_fd:
                for item in Cursor(api_pool.friends, id=user_id,
                                   count=5000).items():
                    tweet_item = json.loads(json.dumps(item._json))
                    tweet_item['smapp_original_user_id'] = user_id
                    tweet_item['smapp_timestamp'] = (datetime.datetime.utcnow()
                                                     .strftime('%Y-%m-%d %H:%M:%S +0000'))
                    write_fd.write(json.dumps(tweet_item) + '\n')

            log('Sending file to s3: {}'.format(s3_filename))
            s3.disk_2_s3(filename, s3_filename)
            s3.disk_2_s3(context['log'], context['s3_log'])
            os.remove(filename)
        else:
            log('{} already queried!!!'.format(user_id))
        log('>>> {} out of {}'.format(i + offset, len(id_list)))
        time.sleep(1)
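
Both versions of twitter_query read their inputs through get_id_list, which is not shown. A plausible sketch, assuming the input file holds one Twitter user id per line (the exact format is an assumption):

def get_id_list(input_file):
    # Assumed input format: one Twitter user id per line; blank lines skipped.
    with open(input_file) as f:
        return [line.strip() for line in f if line.strip()]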
Example #3
def prep_s3(context):
    '''
    Uploads the log and the api tokens, claiming the tokens against further use.
    '''
    log(">>> Start {}".format(datetime.datetime.now()))
    s3.disk_2_s3(context['log'], context['s3_log'])
    s3.disk_2_s3(context['auth'], context['s3_auth'])
Example #4
def query_user_tweets(output, id_list, auth_file, max_id=-1, since_id=-1):
    '''
    Queries Twitter for the timelines of the users in id_list, using credentials from auth_file.
    '''
    num_inputs_queried = 0
    api_pool = TweepyPool(auth_file)
    write_fd = open(output, 'a+')
    for userid in id_list:
        num_inputs_queried += 1
        # count=200 is the page size per request; the cursor will still page
        # through a user's full accessible timeline, which Twitter caps at
        # roughly the most recent 3200 tweets.
        if userid != '':
            try:
                count = 0
                # -1 is the "not set" sentinel; compare explicitly, since
                # bool(-1) is True and a bare truthiness test would misfire.
                if max_id != -1 and since_id != -1:
                    cursor = Cursor(api_pool.user_timeline,
                                    user_id=userid,
                                    count=200,
                                    max_id=max_id,
                                    since_id=since_id,
                                    tweet_mode='extended')
                elif max_id != -1:
                    cursor = Cursor(api_pool.user_timeline,
                                    user_id=userid,
                                    count=200,
                                    max_id=max_id,
                                    tweet_mode='extended')
                elif since_id != -1:
                    cursor = Cursor(api_pool.user_timeline,
                                    user_id=userid,
                                    count=200,
                                    since_id=since_id,
                                    tweet_mode='extended')
                else:
                    cursor = Cursor(api_pool.user_timeline,
                                    user_id=userid,
                                    count=200,
                                    tweet_mode='extended')

                for item in cursor.items():
                    count += 1
                    tweet_item = json.loads(json.dumps(item._json))
                    tweet_item['smapp_timestamp'] = (datetime.datetime.utcnow()
                                                     .strftime('%Y-%m-%d %H:%M:%S +0000'))
                    write_fd.write(json.dumps(tweet_item) + '\n')
            except TweepError as e:
                log('tweepy error: {}'.format(e))
            log('counted {} objects for input {}'.format(count, userid))
        log('number of inputs queried so far: {}'.format(num_inputs_queried))
    write_fd.close()
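
The four-way if/elif above repeats the common Cursor arguments in every branch. One behavior-preserving alternative (a sketch; build_timeline_cursor is a hypothetical name, not part of the repo) is to collect the shared arguments in a kwargs dict and add the optional ids only when they are set:

from tweepy import Cursor  # same Cursor the examples above use

def build_timeline_cursor(api_pool, userid, max_id=-1, since_id=-1):
    # Shared arguments appear once; optional ids are added only when set.
    kwargs = dict(user_id=userid, count=200, tweet_mode='extended')
    if max_id != -1:
        kwargs['max_id'] = max_id
    if since_id != -1:
        kwargs['since_id'] = since_id
    return Cursor(api_pool.user_timeline, **kwargs)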
Example #5
def process_row(row, context):
    '''
    Parses JSON response from Twitter API.
    Writes to local disk and then sends to s3.
    '''
    user_meta = row.copy()
    user_meta['smapp_timestamp'] = (datetime.datetime.utcnow()
                                    .strftime('%Y-%m-%d %H:%M:%S +0000'))
    filename, s3_filename, s3_id_key = get_user_id_file(
        str(user_meta['id']), 
        context
    )
    with open(filename, 'w+') as f:
        f.write(json.dumps(user_meta) + '\n')

    # move to s3.
    s3.disk_2_s3(filename, s3_filename)
    os.remove(filename)
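
get_user_id_file is called throughout but never shown, and the examples disagree on its shape (Example #2 unpacks two values, this one three). A hypothetical three-value version; the path layout and the context keys used below are assumptions, not the repo's actual helper:

import os

def get_user_id_file(user_id, context):
    # Hypothetical helper: derive the local path, the destination on s3,
    # and the s3 key used for the already-queried check. The context keys
    # (output_dir, s3_prefix, s3_bucket) are assumptions.
    filename = os.path.join(context['output_dir'], '{}.json'.format(user_id))
    s3_id_key = '{}/{}.json'.format(context['s3_prefix'], user_id)
    s3_filename = 's3://{}/{}'.format(context['s3_bucket'], s3_id_key)
    return filename, s3_filename, s3_id_key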
def twitter_query(context):
    '''
    Gets user ids and feeds them into a function that queries Twitter.
    '''
    input_file = context['input']
    auth_file = context['auth']
    id_list = get_id_list(input_file)
    offset = context['start_idx_input']
    start_idx = context['start_idx_api']

    log('Creating oauth pool...')
    api_pool = kids_pool(auth_file, start_idx=start_idx, verbose=1)

    for i, user_id in enumerate(id_list[offset:]):
        if i == 0:  # for the first user only, resume from the cursor passed in context
            cursor = context['cursor']
        else:
            cursor = -1
        filename, s3_filename, s3_id_key = get_user_id_file(user_id, context)
        if not s3.exists(s3_id_key):
            query_user_friends_ids(filename, user_id, api_pool, cursor=cursor)
            log('Sending file to s3: {}'.format(s3_filename))
            s3.disk_2_s3(filename, s3_filename)
            s3.disk_2_s3(context['log'], context['s3_log'])
            os.remove(filename)
        else:
            log('{} already queried!!!'.format(user_id))
        log('>>> {} out of {}'.format(i + offset, len(id_list)))
        time.sleep(1)
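Example #1 drives its pool through get_current_api_creds, set_increment, get_current_api_calls, and find_next_token, but no pool implementation is shown. A minimal round-robin sketch of that interface, assuming credentials are simply cycled in order (CredsPool is a hypothetical name, not the repo's TweepyPool or kids_pool):

class CredsPool(object):
    # Hypothetical stand-in for the pools used above: rotates through a
    # list of API credentials and counts calls made so far.
    def __init__(self, creds_list, start_idx=0):
        self.creds_list = creds_list
        self.idx = start_idx
        self.calls = 0

    def get_current_api_creds(self):
        return self.creds_list[self.idx]

    def set_increment(self):
        self.calls += 1

    def get_current_api_calls(self):
        return self.calls

    def find_next_token(self):
        # Rotate to the next set of credentials, wrapping around.
        self.idx = (self.idx + 1) % len(self.creds_list)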


if __name__ == '__main__':
    '''
    This script downloads friends or follower ids locally as csv files.
    After each successful download, the log and the csv are uploaded to s3.
    When all users have been successfully downloaded, the credits are freed up,
    and the log is moved to the archive.
    The DigitalOcean machine used for this query is then destroyed.
    '''
    args = parse_args(sys.argv[1:])
    context = build_context(args)
    logging.basicConfig(filename=context['log'], level=logging.INFO)
    context['volume'] = check_vol_attached(context)
    if context['volume']:  # check if volume is attached
        create_token_files(context)
        prep_s3(context)
        twitter_query(context)
        context['output_bz2'] = bzip2(context)
        s3.disk_2_s3(context['output_bz2'], context['s3_path'])
        settle_affairs_in_s3(context)
        detach_and_destroy_volume(context)
        destroy_droplet(context)