Пример #1
0
def query_user_friends_ids(output, id_list, auth_file):
    '''
    Query Twitter for the friend (following) ids of each user in id_list
    and write one JSON object per friend id to `output`, one per line.

    output    -- path of the file the JSON lines are written to
    id_list   -- iterable of Twitter user ids (empty strings are skipped)
    auth_file -- credentials file handed to TweepyPool
    '''
    logger = logging.getLogger(__name__)

    num_inputs_queried = 0

    # create the api pool
    api_pool = TweepyPool(auth_file)

    # bug fix: the original opened the file without a context manager, so
    # the handle leaked if an unexpected exception escaped the loop
    with open(output, 'w+') as write_fd:
        for userid in id_list:
            num_inputs_queried += 1
            if userid != '':
                count = 0
                try:
                    for item in Cursor(api_pool.friends_ids,
                                       id=userid).items():
                        logger.debug('user id: {}'.format(item))
                        count += 1
                        tweet_item = {'id': item}
                        tweet_item['smapp_original_user_id'] = userid
                        tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log api errors and move on to next user
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, userid)
            logger.info('number of inputs queried so far: %s',
                        num_inputs_queried)
Пример #2
0
def twitter_query(context):
    '''
    Gets user ids, and feeds them into a function to query twitter.

    For each user id (starting at context['start_idx_input']) the friends
    list is written to a local file and shipped to s3, skipping users whose
    file already exists on s3.
    '''
    input_file = context['input']
    auth_file = context['auth']
    id_list = get_id_list(input_file)
    offset = context['start_idx_input']
    # NOTE(review): start_idx is read but never used below — verify intent
    start_idx = context['start_idx_api']

    log('Creating oauth pool...')
    api_pool = TweepyPool(auth_file)
    for i, user_id in enumerate(id_list[offset:]):
        filename, s3_filename = get_user_id_file(user_id, context)
        if not s3.file_exists(s3_filename):
            # bug fix: the original format string had one placeholder for
            # two arguments, so `filename` was silently dropped from the log
            log('writing user id: {} here: {}'.format(user_id, filename))

            with open(filename, 'w+') as write_fd:
                for item in Cursor(api_pool.friends, id=user_id,
                                   count=5000).items():
                    # round-trip through json to get a plain serializable dict
                    tweet_item = json.loads(json.dumps(item._json))
                    tweet_item['smapp_original_user_id'] = user_id
                    tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                    ).strftime('%Y-%m-%d %H:%M:%S +0000')
                    write_fd.write(json.dumps(tweet_item) + '\n')

            log('Sending file to s3: {}'.format(s3_filename))
            s3.disk_2_s3(filename, s3_filename)
            s3.disk_2_s3(context['log'], context['s3_log'])
            os.remove(filename)
        else:
            log('{} already queried!!!'.format(user_id))
        log('>>> {} out of {}'.format(i + offset, len(id_list)))
        # pause between users to be gentle with the api
        time.sleep(1)
Пример #3
0
def query_user_tweets(output, id_list, auth_file):
    '''
    Query up to ~3200 timeline tweets for each user in id_list and write
    each tweet as a JSON line to `output`.

    output    -- path of the file the JSON lines are written to
    id_list   -- iterable of Twitter user ids (empty strings are skipped)
    auth_file -- credentials file handed to TweepyPool
    '''
    logger = logging.getLogger(__name__)

    num_inputs_queried = 0

    # create the api pool
    api_pool = TweepyPool(auth_file)

    # bug fix: the original opened the file without a context manager, so
    # the handle leaked if an unexpected exception escaped the loop
    with open(output, 'w+') as write_fd:
        for userid in id_list:
            num_inputs_queried += 1
            # even though the count is 200 we can cycle through 3200 items.
            # if you put a count variable in this cursor it will iterate up
            # to about 3200
            if userid != '':
                count = 0
                try:
                    for item in Cursor(api_pool.user_timeline,
                                       user_id=userid,
                                       count=200).items():
                        logger.debug('tweet text: %s', item.text)
                        count += 1
                        # round-trip through json for a plain dict copy
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log api errors and continue with next user
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, userid)
            logger.info('number of inputs queried so far: %s',
                        num_inputs_queried)
Пример #4
0
def query_search_tweets(output, terms_list, auth_file):
    '''
    Run a Twitter search for each term in terms_list and write every
    matching tweet as a JSON line to `output`.

    output     -- path of the file the JSON lines are written to
    terms_list -- iterable of search terms (empty strings are skipped)
    auth_file  -- credentials file handed to TweepyPool
    '''
    logger = logging.getLogger(__name__)

    num_inputs_queried = 0

    # create the api pool
    api_pool = TweepyPool(auth_file)

    # bug fix: the original opened the file without a context manager, so
    # the handle leaked if an unexpected exception escaped the loop
    with open(output, 'w+') as write_fd:
        for term in terms_list:
            num_inputs_queried += 1
            count = 0
            if term != '':
                try:
                    # NOTE(review): the term is percent-encoded here before
                    # being passed as q= — confirm the api layer does not
                    # encode it again (double encoding would break queries)
                    for item in Cursor(api_pool.search,
                                       q=urllib.parse.quote(term)).items():
                        logger.debug('tweet text: %s', item.text)
                        count += 1
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_term'] = term
                        tweet_item['smapp_count'] = count
                        tweet_item['smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    # best-effort: log api errors and continue with next term
                    logger.info('tweepy error: %s', e)
                logger.info('counted %s objects for input %s', count, term)
            logger.info('number of inputs queried so far: %s',
                        num_inputs_queried)
Пример #5
0
def query_search_tweets(output, terms_list, auth_file):
    '''
    Look up statuses in batches of up to 100 ids via statuses_lookup and
    write each returned tweet as a JSON line to `output`.

    output     -- path of the file the JSON lines are written to
    terms_list -- iterable of status ids to look up
    auth_file  -- credentials file handed to TweepyPool
    '''
    logger = logging.getLogger(__name__)

    num_inputs_queried = 0

    # create the api pool
    api_pool = TweepyPool(auth_file)

    def _lookup_batch(write_fd, batch, term):
        '''Query one batch of ids and append each result as a JSON line.'''
        nonlocal num_inputs_queried
        num_inputs_queried += 1
        count = 0
        if term != '':
            try:
                for item in api_pool.statuses_lookup(batch):
                    print(item)
                    logger.debug('tweet text: %s', item.text)
                    count += 1
                    tweet_item = json.loads(json.dumps(item._json))
                    tweet_item['smapp_term'] = term
                    tweet_item['smapp_count'] = count
                    tweet_item[
                        'smapp_timestamp'] = datetime.datetime.utcnow(
                        ).strftime('%Y-%m-%d %H:%M:%S +0000')
                    write_fd.write(json.dumps(tweet_item))
                    write_fd.write('\n')
            except TweepError as e:
                logger.info('tweepy error: %s', e)
            logger.info('counted %s objects for input %s', count, term)
        logger.info('number of inputs queried so far: %s',
                    num_inputs_queried)

    # use a context manager so the file is closed even on error
    with open(output, 'w+') as write_fd:
        hundred_terms = []
        term = ''
        for term in terms_list:
            # if we come in and the list already has 100, that means
            # we need to query that 100 and start a fresh batch
            if len(hundred_terms) == 100:
                _lookup_batch(write_fd, hundred_terms, term)
                hundred_terms = [term]
            else:
                hundred_terms.append(term)
        # bug fix: the original dropped the final partial batch — any
        # trailing group of fewer than 100 terms was never queried
        if hundred_terms:
            _lookup_batch(write_fd, hundred_terms, term)
def query_user_tweets(output, id_list, auth_file, max_id=-1, since_id=-1):
    '''
    queries twitter for users from id_list and authentication from auth_file.

    max_id and since_id use -1 (or None) as "not set" sentinels; only real
    values are forwarded to the Twitter API.  Results are appended to
    `output` as one JSON line per tweet.
    '''
    num_inputs_queried = 0
    api_pool = TweepyPool(auth_file)
    # 'a+' preserves the original append semantics; context manager
    # guarantees the handle is closed even on error
    with open(output, 'a+') as write_fd:
        for userid in id_list:
            num_inputs_queried += 1
            # even though the count is 200 we can cycle through 3200 items.
            # if you put a count variable in this cursor it will iterate up
            # to about 3200
            if userid != '':
                count = 0
                try:
                    # bug fix: -1 is truthy, so the original's
                    # `if max_id and since_id` branch always fired with the
                    # default arguments and forwarded max_id=-1/since_id=-1
                    # to the API.  Build kwargs only from real values.
                    cursor_kwargs = {'user_id': userid,
                                     'count': 200,
                                     'tweet_mode': 'extended'}
                    if max_id not in (-1, None):
                        cursor_kwargs['max_id'] = max_id
                    if since_id not in (-1, None):
                        cursor_kwargs['since_id'] = since_id
                    cursor = Cursor(api_pool.user_timeline, **cursor_kwargs)

                    for item in cursor.items():
                        count += 1
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_timestamp'] = (
                            datetime.datetime.utcnow(
                            ).strftime('%Y-%m-%d %H:%M:%S +0000'))
                        write_fd.write(json.dumps(tweet_item))
                        write_fd.write('\n')
                except TweepError as e:
                    log('tweepy error: {}'.format(e))
                log('counted {} objects for input {}'.format(count, userid))
            log('number of inputs queried so far: {}'.format(
                num_inputs_queried))
            # NOTE(review): `context` is not defined in this function's
            # scope — this relies on a module-level `context`; verify,
            # otherwise this raises NameError on every iteration
            s3.disk_2_s3(context['log'], context['s3_log'])
Пример #7
0
def twitter_query(context):
    '''
    Gets user ids, and feeds them into a function to query twitter.

    Writes one JSON line per follower of each user id to context['output']
    and pushes the log file to s3 after each user.
    '''
    output = context['output']
    input_file = context['input']
    auth_file = context['auth']

    # bug fix: this message used to duplicate the 'creating oauth pool'
    # line below, which made the logs misleading
    log('loading id list...')
    id_list = get_id_list(input_file)

    log('creating oauth pool...')
    api_pool = TweepyPool(auth_file)

    log('starting query...')
    num_inputs_queried = 0
    with open(output, 'w+') as write_fd:
        for user_id in id_list:
            num_inputs_queried += 1
            if user_id != '':
                count = 0
                try:
                    for item in Cursor(api_pool.followers,
                                       id=user_id,
                                       count=5000).items():
                        log('user id: {},  and screen_name {}'.format(
                            item.id, item.screen_name))
                        count += 1
                        tweet_item = json.loads(json.dumps(item._json))
                        tweet_item['smapp_original_user_id'] = user_id
                        tweet_item[
                            'smapp_timestamp'] = datetime.datetime.utcnow(
                            ).strftime('%Y-%m-%d %H:%M:%S +0000')
                        write_fd.write(json.dumps(tweet_item) + '\n')
                except TweepError as e:
                    # best-effort: log and continue with the next user
                    log('tweepy error: {}'.format(e))

                # update the logs and send to s3
                log('counted {} objects for input {}'.format(count, user_id))
                s3.disk_2_s3(context['log'], context['s3_log'])

            log('number of inputs queried so far: {}'.format(
                num_inputs_queried))
            s3.disk_2_s3(context['log'], context['s3_log'])
Пример #8
0
    # Load the input ids/usernames from a .json or .csv file, then run the
    # requested conversion.  NOTE(review): this is the tail of a function
    # whose `def` line is outside this view — confirm args/logger come from
    # the enclosing scope.
    input_list = []
    _, file_extension = os.path.splitext(args.input)

    if file_extension == '.json':
        logger.info('trying json...')
        # the whole file is parsed as one JSON document (a list of inputs)
        id_data = open(args.input).read()
        input_list = json.loads(id_data)
        logger.info('loaded input_list as json')
    elif file_extension == '.csv':
        logger.info('is not json, trying csv')
        csvhandle = open(args.input)
        csvreader = csv.reader(csvhandle)
        count = 0
        for row in csvreader:
            # count > 0 skips the header row; only the first column is kept
            if count > 0:
                input_list.append(row[0])
            count = count + 1
        logger.info('loaded input_list as csv')

    #create the api pool
    api = TweepyPool(args.auth)

    # dispatch on the requested operation; unknown operations are a no-op
    if args.operation == 'ids_users':
        ids_to_usernames(input_list, args.output, api)
    elif args.operation == 'users_ids':
        usernames_to_ids(input_list, args.output, api)
'''
author @yvan
tweepy docs here : https://github.com/tweepy/tweepy/blob/master/tweepy/api.py#L146
'''