def get_user_info(user_id, token, logfilename):
    """Fetch the profile of one user and stamp it with the crawl time.

    Issues a single request built from ``_4SQ_USER_URL`` and interprets the
    returned HTTP-like status code.

    Args:
        user_id: Identifier interpolated into the request URL and log lines.
        token: Credential interpolated into the request URL.
        logfilename: Path handed to ``Logger`` for this call's log output.

    Returns:
        A ``(status, user_info, amount_pages)`` tuple:

        * status 500 or 404 -> ``(status, None, 0)``
        * status 200 -> ``(200, user_info, 1)`` where ``user_info`` is the
          ``['response']['user']`` object of the decoded body with an added
          ``'current_time'`` key (epoch seconds as a string).  If the body
          cannot be decoded or lacks that path, ``user_info`` is an empty
          dict, so callers can detect the failure by truthiness.
        * any other status -> ``(status, None, 1)``
    """
    logger_obj = Logger(log_name=' USER INFO THREAD', log_filename=logfilename)
    status, user_info_str, _ = request_data(_4SQ_USER_URL % (user_id, token))

    if status in (500, 404):
        logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        return status, None, 0

    if status != 200:
        logger_obj.put_message('critical', 'Other error - %s' % user_id)
        logger_obj.put_message('critical', status)
        return status, None, 1

    logger_obj.put_message('debug', '200 - %s' % user_id)
    user_info = {}
    try:
        user_info = json.loads(user_info_str)['response']['user']
        user_info['current_time'] = str(int(time.mktime(datetime.datetime.now().timetuple())))
    except (ValueError, KeyError, TypeError) as exc:
        # ValueError: body is not valid JSON; KeyError/TypeError: the decoded
        # document does not have the expected response/user shape.  Keep the
        # best-effort contract (status 200 with whatever was parsed) but do
        # not hide the problem.
        logger_obj.put_message('error', 'Malformed user payload - %s (%r)' % (user_id, exc))
    return status, user_info, 1
Пример #2
0
def _timed_write(filename, user_id, result):
    """Persist one crawl result with write_data and return the seconds spent."""
    started = datetime.datetime.now()
    write_data(filename, user_id, json.dumps(result))
    # total_seconds() also accounts for the ``days`` field, so a wall-clock
    # adjustment during the write cannot produce a bogus ~86400 s reading.
    return (datetime.datetime.now() - started).total_seconds()


def _sleep_between_users(threads, seconds_already_spent):
    """Sleep TIME_INTERVAL_USERS per page (max over threads) plus 0-2 s jitter,
    minus the time already spent; does nothing if that is not positive."""
    max_pages = max(thread.amount_pages for thread in threads)
    delay = TIME_INTERVAL_USERS * max_pages + 2 * random.random() - seconds_already_spent
    if delay > 0:
        time.sleep(delay)


def main(process_name):
    """Crawl users from an id file until it is empty or a crawl fails.

    Command-line arguments read from ``sys.argv``:

    * ``sys.argv[2]`` - file holding the ids still to be crawled
    * ``sys.argv[3]`` - prefix for the four output files
    * ``sys.argv[4]`` - credentials file (parsed by ``read_credentials_file``)

    On every iteration the id file is reloaded, one id is picked at random
    and four crawler threads (user info, list of dones, friends, followers)
    are started and joined.  Depending on their statuses:

    * all 200 -> available results are written, the id is removed from the
      id file, and the loop sleeps before the next user;
    * user-info status 404 or 400 -> the id is removed and the loop sleeps;
    * anything else -> the loop sleeps once and then stops.

    Args:
        process_name: Used to build the output file names and the log file
            name.
    """
    ids_filename = sys.argv[2]
    output_prefix = sys.argv[3] + process_name
    credentials_file = sys.argv[4]

    # (crawl kind, key of its token in the credentials dict); the kind is
    # also the suffix of the matching output file.
    crawl_kinds = (
        ('user_info', 'token_user_info'),
        ('list_dones', 'token_user_tips'),
        ('user_friends', 'token_user_friends'),
        ('user_followers', 'token_user_followers'),
    )
    output_filenames = [output_prefix + '_' + kind + '.txt' for kind, _ in crawl_kinds]

    if socket.gethostname() in ['mummra', 'gargamel', 'esqueleto', 'cavernoso']:
        logfilename = '/data/users/saulomrr/_4sq_crawler/log/' + process_name + '.log'
    else:
        logfilename = '/var/tmp/_4sq_crawler/log/' + process_name + '.log'

    credentials_dict = read_credentials_file(credentials_file)
    logger_obj = Logger(log_name=' MAIN THREAD', log_filename=logfilename)

    ids = True  # placeholder so the loop body runs at least once
    while ids:
        ids = load_ids(ids_filename)
        if not ids:
            break
        pos_id = random.randrange(len(ids))
        user_id = ids[pos_id]

        threads = [
            threaded_4sq_crawler(user_id, kind, {'token': credentials_dict[token_key]}, logfilename)
            for kind, token_key in crawl_kinds
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()

        write_seconds = 0
        if all(thread.status == 200 for thread in threads):
            # User info is skipped when empty (a failed parse yields {});
            # the list results are written even when they are empty lists.
            if threads[0].result:
                write_seconds += _timed_write(output_filenames[0], user_id.strip(), threads[0].result)
            for thread, filename in zip(threads[1:], output_filenames[1:]):
                if thread.result is not None:
                    write_seconds += _timed_write(filename, user_id.strip(), thread.result)

            # Bookkeeping happens for every fully successful crawl.  It was
            # previously nested under the followers-result check, so a None
            # followers result left the id in the file and skipped the sleep.
            logger_obj.put_message('debug', 'CRAWLED - %s' % user_id)
            ids.pop(pos_id)
            update_ids_file(ids_filename, ids)
            _sleep_between_users(threads, write_seconds)

        elif threads[0].status in (404, 400):
            logger_obj.put_message('debug', 'NOT EXISTS - %s' % user_id)
            ids.pop(pos_id)
            update_ids_file(ids_filename, ids)
            _sleep_between_users(threads, write_seconds)

        else:
            logger_obj.put_message('critical', 'ERROR - GONNA RETRY %s' % user_id)
            _sleep_between_users(threads, write_seconds)
            # NOTE(review): the loop stops here although the message says
            # "retry"; presumably the caller restarts main() - confirm.
            break
def get_user_dones_list(user_id, token, logfilename):
    """Collect every item of a user's "dones" list, page by page.

    Requests ``_4SQ_USER_LIST_DONES_URL`` with an increasing offset until the
    offset reaches the ``count`` reported by the first page, a page comes
    back empty, or a request fails.

    Args:
        user_id: Identifier interpolated into the request URL and log lines.
        token: Credential interpolated into the request URL.
        logfilename: Path handed to ``Logger`` for this call's log output.

    Returns:
        A ``(status, dones, page_count)`` tuple.  On status 200 ``dones`` is
        the list of collected items, each given a ``'current_time'`` key
        (epoch seconds as a string); if a later page is malformed the items
        gathered so far are returned.  On any other status ``dones`` is
        ``None``.  ``page_count`` counts the successful first page plus every
        follow-up request that was issued.
    """
    requester_obj = requester()
    logger_obj = Logger(log_name=' LIST DONES THREAD', log_filename=logfilename)
    dones_list = []
    offset = 0
    page_count = 0

    def failed(status, pages):
        # 500/404 are expected failure modes; anything else is unexpected.
        if status in (500, 404):
            logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        else:
            logger_obj.put_message('critical', 'Other error - %s' % user_id)
            logger_obj.put_message('critical', status)
        return status, None, pages

    def finished(status, pages):
        # One timestamp for the whole batch instead of recomputing per item.
        stamp = str(int(time.mktime(datetime.datetime.now().timetuple())))
        for done in dones_list:
            done['current_time'] = stamp
        logger_obj.put_message('debug', '200 - %s' % user_id)
        return status, dones_list, pages

    status, dones_str, _ = requester_obj.get_response(
        _4SQ_USER_LIST_DONES_URL % (user_id, token, str(offset)))
    if status != 200:
        return failed(status, page_count)

    page_count += 1
    list_items = json.loads(dones_str)['response']['list']['listItems']
    dones_count = int(list_items['count'])
    items = list_items['items']
    if not items:
        return finished(status, page_count)
    dones_list.extend(items)
    offset += len(items)

    while offset < dones_count:
        status, dones_str, _ = requester_obj.get_response(
            _4SQ_USER_LIST_DONES_URL % (user_id, token, str(offset)))
        page_count += 1
        if status != 200:
            return failed(status, page_count)

        dones_obj = json.loads(dones_str)
        try:
            items = dones_obj['response']['list']['listItems']['items']
            if items:
                dones_list.extend(items)
        except (KeyError, TypeError) as exc:
            # The page lacks the expected structure: keep what was already
            # collected rather than losing it, but record why we stopped.
            logger_obj.put_message(
                'error', 'Malformed dones page at offset %s - %s (%r)' % (offset, user_id, exc))
            return finished(status, page_count)

        if not items:
            # An empty page ends the listing even if count was not reached.
            return finished(status, page_count)
        offset += len(items)

    return finished(status, page_count)
def get_user_followers(user_id, token, logfilename):
    """Collect every follower of a user, page by page.

    Requests ``_4SQ_USER_FOLLOWERS_URL`` with an increasing offset until the
    offset reaches the ``count`` reported by the first page, a page comes
    back empty, or a request fails.

    Args:
        user_id: Identifier interpolated into the request URL and log lines.
        token: Credential interpolated into the request URL.
        logfilename: Path handed to ``Logger`` for this call's log output.

    Returns:
        A ``(status, followers, page_count)`` tuple.  On status 200
        ``followers`` is the list of collected items, each given a
        ``'current_time'`` key (epoch seconds as a string); if a later page
        is malformed the items gathered so far are returned.  On any other
        status ``followers`` is ``None``.  ``page_count`` counts the
        successful first page plus every follow-up request that was issued.
    """
    logger_obj = Logger(log_name=' LIST FOLLOWERS THREAD', log_filename=logfilename)
    follower_list = []
    offset = 0
    page_count = 0

    def failed(status, pages):
        # 500/404 are expected failure modes; anything else is unexpected.
        if status in (500, 404):
            logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        else:
            logger_obj.put_message('critical', 'Other error - %s' % user_id)
            logger_obj.put_message('critical', status)
        return status, None, pages

    def finished(status, pages):
        # One timestamp for the whole batch instead of recomputing per item.
        stamp = str(int(time.mktime(datetime.datetime.now().timetuple())))
        for follower in follower_list:
            follower['current_time'] = stamp
        logger_obj.put_message('debug', '200 - %s' % user_id)
        return status, follower_list, pages

    status, followers_str, _ = request_data(
        _4SQ_USER_FOLLOWERS_URL % (user_id, token, str(offset)))
    if status != 200:
        return failed(status, page_count)

    page_count += 1
    followers = json.loads(followers_str)['response']['followers']
    followers_count = int(followers['count'])
    items = followers['items']
    if not items:
        return finished(status, page_count)
    follower_list.extend(items)
    offset += len(items)

    while offset < followers_count:
        status, followers_str, _ = request_data(
            _4SQ_USER_FOLLOWERS_URL % (user_id, token, str(offset)))
        page_count += 1
        if status != 200:
            return failed(status, page_count)

        followers_obj = json.loads(followers_str)
        try:
            items = followers_obj['response']['followers']['items']
            if items:
                follower_list.extend(items)
        except (KeyError, TypeError) as exc:
            # The page lacks the expected structure: keep what was already
            # collected rather than losing it, but record why we stopped.
            logger_obj.put_message(
                'error', 'Malformed followers page at offset %s - %s (%r)' % (offset, user_id, exc))
            return finished(status, page_count)

        if not items:
            # An empty page ends the listing even if count was not reached.
            return finished(status, page_count)
        offset += len(items)

    return finished(status, page_count)