def get_user_info(user_id, token, logfilename):
    """Fetch a single Foursquare user's profile.

    Parameters:
        user_id: Foursquare user id interpolated into _4SQ_USER_URL.
        token: OAuth access token interpolated into the same URL.
        logfilename: path handed to Logger for this thread's log file.

    Returns:
        A ``(status, user_info, pages)`` tuple:
        - on HTTP 500/404: ``(status, None, 0)``;
        - on HTTP 200: the parsed ``response.user`` dict stamped with a
          ``current_time`` key (epoch seconds as a string) and ``pages == 1``;
          if the payload cannot be parsed, whatever was assembled so far
          (possibly an empty dict) is returned instead;
        - on any other status: ``(status, None, 1)``.
    """
    logger_obj = Logger(log_name=' USER INFO THREAD', log_filename=logfilename)
    status, user_info_str, info = request_data(_4SQ_USER_URL % (user_id, token))
    if status in (500, 404):
        logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        return status, None, 0
    elif status == 200:
        logger_obj.put_message('debug', '200 - %s' % user_id)
        user_info = dict()
        # BUG FIX: the original bare `except:` silently swallowed every
        # exception, including KeyboardInterrupt/SystemExit. Catch only what
        # json.loads / the key lookups can actually raise; the failure
        # behavior (return the partial dict) is unchanged.
        try:
            user_info = json.loads(user_info_str)['response']['user']
            user_info['current_time'] = str(int(time.mktime(datetime.datetime.now().timetuple())))
            return status, user_info, 1
        except (ValueError, KeyError, TypeError):
            return status, user_info, 1
    else:
        logger_obj.put_message('critical', 'Other error - %s' % user_id)
        logger_obj.put_message('critical', status)
        return status, None, 1
def _timed_write(output_filename, user_id, result):
    """Write one crawl result; return the wall-clock seconds the write took."""
    start = datetime.datetime.now()
    write_data(output_filename, user_id.strip(), json.dumps(result))
    elapsed = datetime.datetime.now() - start
    return elapsed.seconds + elapsed.microseconds / float(1000000)


def _throttle(threads, sum_time):
    """Sleep to respect the per-user rate limit.

    The budget is TIME_INTERVAL_USERS per crawled page (plus a small random
    jitter), minus the time already spent writing results (`sum_time`).
    """
    maximum_amount_pages = max(thread.amount_pages for thread in threads)
    sleep_time_overall = TIME_INTERVAL_USERS * maximum_amount_pages + 2 * random.random() - sum_time
    if sleep_time_overall > 0:
        time.sleep(sleep_time_overall)


def main(process_name):
    """Main crawl loop: pick random ids from a file, crawl each user's info,
    dones, friends and followers in four parallel threads, persist the
    results, remove finished ids from the id file, and rate-limit between
    users.

    Command line (positional, starting at argv[2]):
        argv[2]: path of the file holding the ids still to crawl.
        argv[3]: output directory/prefix for the four result files.
        argv[4]: credentials file with the four OAuth tokens.
    """
    ids_filename = sys.argv[2]
    output_filename_user_info = sys.argv[3] + process_name + '_user_info' + '.txt'
    output_filename_list_dones = sys.argv[3] + process_name + '_list_dones' + '.txt'
    output_filename_user_friends = sys.argv[3] + process_name + '_user_friends' + '.txt'
    output_filename_user_followers = sys.argv[3] + process_name + '_user_followers' + '.txt'
    credentials_file = sys.argv[4]
    # Known lab hosts log to the shared data partition; anywhere else falls
    # back to /var/tmp.
    if socket.gethostname() in ['mummra', 'gargamel', 'esqueleto', 'cavernoso']:
        logfilename = '/data/users/saulomrr/_4sq_crawler/log/' + process_name + '.log'
    else:
        logfilename = '/var/tmp/_4sq_crawler/log/' + process_name + '.log'
    credentials_dict = read_credentials_file(credentials_file)
    logger_obj = Logger(log_name=' MAIN THREAD', log_filename=logfilename)
    while True:
        sum_time = 0
        ids = load_ids(ids_filename)
        if not ids:
            break
        # Pick a random id so parallel processes don't all hit the same user.
        pos_id = int(len(ids) * random.random())
        user_id = ids[pos_id]  # renamed from `id` to avoid shadowing the builtin
        # One crawler thread per endpoint, each with its own token.
        threaded_array = [
            threaded_4sq_crawler(user_id, 'user_info', {'token': credentials_dict['token_user_info']}, logfilename),
            threaded_4sq_crawler(user_id, 'list_dones', {'token': credentials_dict['token_user_tips']}, logfilename),
            threaded_4sq_crawler(user_id, 'user_friends', {'token': credentials_dict['token_user_friends']}, logfilename),
            threaded_4sq_crawler(user_id, 'user_followers', {'token': credentials_dict['token_user_followers']}, logfilename),
        ]
        for thread in threaded_array:
            thread.start()
        for thread in threaded_array:
            thread.join()
        if all(thread.status == 200 for thread in threaded_array):
            # NOTE: user_info deliberately uses truthiness (an empty dict
            # from a failed parse is skipped), while the list results use
            # `is not None` (an empty list is still a valid, writable result).
            if threaded_array[0].result:
                sum_time += _timed_write(output_filename_user_info, user_id, threaded_array[0].result)
            list_outputs = [output_filename_list_dones,
                            output_filename_user_friends,
                            output_filename_user_followers]
            for thread, output_filename in zip(threaded_array[1:], list_outputs):
                if thread.result is not None:
                    sum_time += _timed_write(output_filename, user_id, thread.result)
            logger_obj.put_message('debug', 'CRAWLED - %s' % user_id)
            user_id = ids.pop(pos_id)
            update_ids_file(ids_filename, ids)
            _throttle(threaded_array, sum_time)
        elif threaded_array[0].status in (404, 400):
            # User does not exist: drop the id and move on.
            logger_obj.put_message('debug', 'NOT EXISTS - %s' % user_id)
            user_id = ids.pop(pos_id)
            update_ids_file(ids_filename, ids)
            _throttle(threaded_array, sum_time)
        else:
            # Unexpected error: keep the id in the file so a later run
            # retries it, then stop this process.
            logger_obj.put_message('critical', 'ERROR - GONNA RETRY %s' % user_id)
            _throttle(threaded_array, sum_time)
            break
def get_user_dones_list(user_id, token, logfilename):
    """Fetch every page of a user's 'dones' list from Foursquare.

    Pages through _4SQ_USER_LIST_DONES_URL (offset-based) until the reported
    item count is reached or a page comes back empty/unparseable.

    Parameters:
        user_id: Foursquare user id interpolated into the URL.
        token: OAuth access token interpolated into the URL.
        logfilename: path handed to Logger for this thread's log file.

    Returns:
        ``(status, dones_list, page_count)``. On success every collected item
        is stamped with a ``current_time`` key (epoch seconds as a string).
        ``dones_list`` is None when a page returns 500/404 or an unexpected
        status; ``page_count`` is the number of pages actually fetched.
    """
    requester_obj = requester()
    dones_list = []
    offset = 0
    page_count = 0
    logger_obj = Logger(log_name=' LIST DONES THREAD', log_filename=logfilename)

    def _finish(final_status):
        # Shared success epilogue (the original duplicated this three times):
        # stamp every collected item with the crawl time, log, and return.
        for done in dones_list:
            done['current_time'] = str(int(time.mktime(datetime.datetime.now().timetuple())))
        logger_obj.put_message('debug', '200 - %s' % user_id)
        return final_status, dones_list, page_count

    status, dones_str, info = requester_obj.get_response(_4SQ_USER_LIST_DONES_URL % (user_id, token, str(offset)))
    if status in (500, 404):
        logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        return status, None, page_count
    elif status == 200:
        page_count += 1
        dones_obj = json.loads(dones_str)
        dones_count = int(dones_obj['response']['list']['listItems']['count'])
        items = dones_obj['response']['list']['listItems']['items']
        if not items:
            # Empty first page: nothing more to fetch.
            return _finish(status)
        dones_list += items
        offset += len(items)
        while offset < dones_count:
            status, dones_str, info = requester_obj.get_response(_4SQ_USER_LIST_DONES_URL % (user_id, token, str(offset)))
            page_count += 1
            if status in (500, 404):
                logger_obj.put_message('error', str(status) + ' - %s' % user_id)
                return status, None, page_count
            elif status == 200:
                dones_obj = json.loads(dones_str)
                # BUG FIX: the original bare `except:` swallowed everything;
                # only the key lookup on a malformed payload can fail here,
                # and the failure behavior (finish with what we have) is kept.
                try:
                    items = dones_obj['response']['list']['listItems']['items']
                except (KeyError, TypeError):
                    return _finish(status)
                if not items:
                    return _finish(status)
                dones_list += items
                offset += len(items)
            else:
                logger_obj.put_message('critical', 'Other error - %s' % user_id)
                logger_obj.put_message('critical', status)
                return status, None, page_count
        return _finish(status)
    else:
        logger_obj.put_message('critical', 'Other error - %s' % user_id)
        logger_obj.put_message('critical', status)
        return status, None, page_count
def get_user_followers(user_id, token, logfilename):
    """Fetch every page of a user's followers from Foursquare.

    Pages through _4SQ_USER_FOLLOWERS_URL (offset-based) until the reported
    follower count is reached or a page comes back empty/unparseable.

    Parameters:
        user_id: Foursquare user id interpolated into the URL.
        token: OAuth access token interpolated into the URL.
        logfilename: path handed to Logger for this thread's log file.

    Returns:
        ``(status, follower_list, page_count)``. On success every collected
        follower is stamped with a ``current_time`` key (epoch seconds as a
        string). ``follower_list`` is None when a page returns 500/404 or an
        unexpected status; ``page_count`` is the number of pages fetched.
    """
    follower_list = []
    offset = 0
    page_count = 0
    logger_obj = Logger(log_name=' LIST FOLLOWERS THREAD', log_filename=logfilename)

    def _finish(final_status):
        # Shared success epilogue (the original duplicated this three times):
        # stamp every collected follower with the crawl time, log, and return.
        for follower in follower_list:
            follower['current_time'] = str(int(time.mktime(datetime.datetime.now().timetuple())))
        logger_obj.put_message('debug', '200 - %s' % user_id)
        return final_status, follower_list, page_count

    status, followers_str, info = request_data(_4SQ_USER_FOLLOWERS_URL % (user_id, token, str(offset)))
    if status in (500, 404):
        logger_obj.put_message('error', str(status) + ' - %s' % user_id)
        return status, None, page_count
    elif status == 200:
        page_count += 1
        followers_obj = json.loads(followers_str)
        followers_count = int(followers_obj['response']['followers']['count'])
        items = followers_obj['response']['followers']['items']
        if not items:
            # Empty first page: nothing more to fetch.
            return _finish(status)
        follower_list += items
        offset += len(items)
        while offset < followers_count:
            status, followers_str, info = request_data(_4SQ_USER_FOLLOWERS_URL % (user_id, token, str(offset)))
            page_count += 1
            if status in (500, 404):
                logger_obj.put_message('error', str(status) + ' - %s' % user_id)
                return status, None, page_count
            elif status == 200:
                followers_obj = json.loads(followers_str)
                # BUG FIX: the original bare `except:` swallowed everything;
                # only the key lookup on a malformed payload can fail here,
                # and the failure behavior (finish with what we have) is kept.
                try:
                    items = followers_obj['response']['followers']['items']
                except (KeyError, TypeError):
                    return _finish(status)
                if not items:
                    return _finish(status)
                follower_list += items
                offset += len(items)
            else:
                logger_obj.put_message('critical', 'Other error - %s' % user_id)
                logger_obj.put_message('critical', status)
                return status, None, page_count
        return _finish(status)
    else:
        logger_obj.put_message('critical', 'Other error - %s' % user_id)
        logger_obj.put_message('critical', status)
        return status, None, page_count