def run(self):
    """Worker-thread loop: pop queued profile names and fetch their HTML.

    Runs until self.stop is set. Fetched results are appended to
    self.handler.htmls as (user, check_existence, profile_html, alias_html).
    """
    while not self.stop:
        #take the next queued name, if any, while holding the queue lock
        with self.handler.queue_lock:
            user = self.handler.queue.pop(0) if self.handler.queue else None
        if not user:
            #nothing queued: idle briefly and poll again
            time.sleep(1)
            continue
        #a leading '-' marks a randomly generated id whose existence must be verified
        check_existence = user[0] == '-'
        if check_existence:
            user = user[1:]
        url = "http://steamcommunity.com/" + user
        #fetch the profile page and its aliases page
        result = (
            user,
            check_existence,
            request_html(user, url),
            request_html(user + "/ajaxaliases", url + "/ajaxaliases"),
        )
        with self.handler.html_lock:
            self.handler.htmls.append(result)
def dump_status(self, init = 0): starttime = time.clock() self.update_uptime(starttime) #remove older than hour data if len(self.save_times): hour_ago = time.time() - 3600 if self.save_times[-1] < hour_ago: self.save_times = [] self.save_amounts = [] else: i = 0 while self.save_times[i] < hour_ago: i+= 1 self.save_times = self.save_times[i:] self.save_amounts = self.save_amounts[i:] #json data keys = ("files", "bytes", "inittime", "total_crawls", "total_bytes", "crawl_age", "uptime") values = (len(self.save_times), sum(self.save_amounts), self.alltimestats[0], self.alltimestats[1], self.alltimestats[2], time.time() - self.alltimestats[3], self.alltimestats[4]) data = get_json(keys, values) #rest self.request_time = max(int(time.time()), self.request_time + 1) _hash = calc_hash(self.request_time) print " " + str(request_html("status dump", DATA_SAVER, get_post_values(_hash, self.request_time, "s", data, init))[-1]), print get_time_string(starttime)
def run(self):
    """Main crawl loop.

    Alternates between crawling queued games and consuming user-profile HTML
    produced by the request-handler thread, while periodically dumping status,
    saving the queue, backing up files, and running the big data dump/analysis.
    Exits when self.quit is set and the request handler reports done.
    """
    print "Really, no need for another Steam crawler"
    self.request_handler.start()
    health_check = True
    start_time = time.clock()
    queue_time = start_time
    dump_time = start_time - DATA_DUMP_TIME / 2 #first dump faster
    dump_time2 = start_time
    self.session_starttime = start_time
    self.dump_status(1)
    while not self.quit or not self.request_handler.done():
        if self.quit:
            self.request_handler.stop()
        if len(self.games_queue):
            #game crawl branch; html = 1 is a marker meaning "not from handler"
            game = self.games_queue.pop()
            print "Crawling game: " + game
            html = 1
            html1 = request_html("game " + game, "http://steamcommunity.com/app/" + game)
            if html1[0]:
                self.parse_game(html1[2], game)
            #NOTE: trailing comma is deliberate — makes the one-element tuple
            #(False,) so html2[0] and len(html2) checks below stay valid
            html2 = False,
        else:
            #user crawl branch; get_html() returns -1 when nothing is ready
            html = self.request_handler.get_html()
            if html != -1:
                html1 = html[2]
                html2 = html[3]
                self.current_user = html[0]
                self.current_url = "http://steamcommunity.com/" + self.current_user
                if html1[0]:
                    self.parse(html1[2], html2[2] if html2[0] else None, html[1])
        #stats
        current_time = time.time()
        if html != -1 and len(html1) > 1:
            self.save_times.append(current_time)
            html2_size = html2[1] if len(html2) > 1 else 0
            self.save_amounts.append(html1[1] + html2_size)
            self.alltimestats[1] += 1
            self.alltimestats[2] += html1[1] + html2_size
        #sleep
        end_time = time.clock()
        elapsed_time = end_time - start_time
        sleep_time = SLEEP_TIME - elapsed_time
        # print time until analysis
        time_until_analysis = DATA_DUMP_TIME - end_time + dump_time
        print "Analyzing in %i:%02i \r" % (time_until_analysis // 60, time_until_analysis % 60),
        # sleep now
        if sleep_time > 0:
            time.sleep(sleep_time)
            start_time = end_time + sleep_time
        else:
            start_time = end_time
        #performance stats (only iterations that actually fetched something)
        if elapsed_time < ERROR_TIME and html != -1 and len(html1) > 1:
            self.crawl_times_sum += elapsed_time
            self.crawl_times_amount += 1
            if health_check:
                #NOTE(review): "succesfully" is a typo in a runtime string;
                #left untouched here — fix belongs in a behavior-changing edit
                print "Crawling succesfully"
                health_check = False
        #data save/dump and backup (mutually exclusive per iteration)
        if end_time - dump_time > DATA_DUMP_TIME:
            #sync: stop the handler, then dump only once it has drained;
            #if not yet done, this branch retries on following iterations
            self.request_handler.stop()
            if self.request_handler.done():
                self.dump_data()
                queue_time = time.clock()
                dump_time = queue_time
                dump_time2 = queue_time
                self.request_handler.start()
        elif end_time - dump_time2 > STATUS_DUMP_TIME:
            #status
            self.dump_status()
            dump_time2 = end_time
        elif current_time > self.next_backup:
            #backup
            self.save_queue()
            self.next_backup = backup_files()
            queue_time = time.clock()
        elif end_time - queue_time > QUEUE_SAVE_TIME:
            #save
            self.save_queue()
            queue_time = time.clock()
    #shutdown path
    if self.quit_analyze:
        self.dump_data()
    else:
        self.dump_status()
    self.save_queue() #do this last!
def synchronize(self, request_time, bg_images, game_names, hi_alias, existlist):
    """Periodic analysis pass over the whole user database.

    Merges newly crawled users into the on-disk database ("mem/data"),
    computes aggregate statistics (name frequencies, item extremes,
    backgrounds, importance), builds the recrawl queue, and uploads a JSON
    data dump. Returns (request_time, recrawl_queue, mean_crawl_age,
    per-item maxima tuple, hi_alias), or False when there is nothing to do.

    NOTE(review): this method is defined twice in this file; this earlier
    definition is shadowed by the later duplicate — confirm and remove one.
    """
    starttime = time.clock()
    new_user_len = len(self.new_steamid_l)
    if not new_user_len:
        print "\n No new users, aborting analysis"
        return False #this should crash
    print "\n Analyzing data"
    current_time = time.time()
    #free memory (this will be updated later)
    self.found_users = set()
    steamid_max = SOME_STEAM_ID
    steamid_min = SOME_STEAM_ID
    total_files = 0
    total_crawl_age = 0.0
    general_names = ("public", "urls", "avatars", "bans", "backgrounds")
    general_names_private_index = 4
    general = [0] * len(general_names)
    bg_image = dict()
    items = [] # [ [values], (best value, best id, best name), (second value, second id, second name) ]
    for i in range(len(self.item_names)):
        items.append([countmean(), [0], [0]])
    common_names = (dict(), dict(), dict()) #full names, cleaned names, words
    cleaned_names_ws = [] #storage of most common spellings for cleaned names
    for i in range(COMMON_NAME_AMOUNT + 1):
        cleaned_names_ws.append(dict()) #should be same size as common_names_rec[1]
    words_ws = [] #storage of most common spellings for words
    for i in range(COMMON_NAME_AMOUNT):
        words_ws.append(dict()) #should be same size as common_names_rec[2]
    imprt_correct = dict() #importance container; sent to web [(id, name, steamid) = value]
    recrawl_queue = [] #used to determine who to recrawl
    #levels and badges
    levels_per_badges = 0
    badges_per_levels = 0
    #recrawl types
    recrawl_hiprivate = 0
    recrawl_important = 0
    recrawl_noname = 0
    recrawl_alias = 0
    recrawl_special = 0
    recrawl_name = 0
    recrawl_clean = 0
    recrawl_clean2 = 0
    recrawl_word = 0
    recrawl_word2 = 0
    recrawl_bg = 0
    #missing something
    without_name = 0
    without_something = 0
    #work on a snapshot copy so "mem/data" can be rewritten in place below
    if file_exists("mem/data"):
        shutil.copyfile("mem/data", "mem/data_temp")
        user_data_file = open("mem/data_temp", "rb")
    else:
        user_data_file = StringIO.StringIO("")
    new_data_file = open("mem/data", "wb")
    progress = Progress(self.users_in_database + len(self.new_steamid_l), "1/3")
    #analyze
    i = 0
    index = 0
    user_data = [0] * 5 #one database record = 5 consecutive lines
    read_new_users = False
    count_updated = 0
    count_new = 0
    while i < new_user_len:
        #construct user and determine if analyzed
        analyze_this = read_new_users
        if read_new_users: #from memory
            if self.new_steamid_l[i]:
                u = self.new_users[i]
                count_new += 1
                progress.increment()
            else:
                analyze_this = False #already updated
            i += 1
        else: #from database file
            line = user_data_file.readline()
            if line:
                user_data[index] = line
                index += 1
                if index == 5: #full 5-line record assembled
                    analyze_this = True
                    u = User(data=user_data)
                    uid = file_steamid_to_sid(user_data[1])
                    if uid in self.new_steamid_s: #updated
                        uindex = self.new_steamid_l.index(uid)
                        u.update(self.new_users[uindex], self.item_names, self.item_important)
                        #mark the in-memory copy consumed so phase two skips it
                        self.new_steamid_l[uindex] = None
                        self.new_steamid_s.remove(uid)
                        count_updated += 1
                        progress.increment()
                    index = 0
                    progress.increment()
            else:
                #database exhausted: switch to the in-memory new users
                read_new_users = True
        #analyze now
        if analyze_this:
            new_data_file.write(u.dump) #write
            total_files += 1
            total_crawl_age += u.time
            #data
            general[1] += u.custom_url()
            general[2] += u.data[1]
            general[3] += u.data[2]
            #common names
            if u.name:
                #whole
                try:
                    common_names[0][u.name] += 1
                except KeyError:
                    common_names[0][u.name] = 1
                #cleaned (lower case no spaces no special)
                test_name = u.name_cleaned1()
                if len(test_name) >= MIN_COMMON_NAME:
                    try:
                        common_names[1][test_name] += 1
                    except KeyError:
                        common_names[1][test_name] = 1
                #words (lower case)
                for word in u.words():
                    if len(word) >= MIN_COMMON_WORD:
                        test_word = word.lower()
                        try:
                            common_names[2][test_word] += 1
                        except KeyError:
                            common_names[2][test_word] = 1
            else:
                without_name += 1
            if not u.private:
                #no items
                missing_something = 1
                #general
                general[0] += 1
                general[4] += u.data[3]
                #levels and badges
                u_l_p_b, u_b_p_l = u.levels_n_badges(self.level_index, self.badge_index)
                #NOTE(review): the elif couples the two record-holders — a user
                #can only set badges_per_levels when not beating levels_per_badges;
                #if no user ever reaches the elif, b_per_l_user stays undefined
                #and the values list below raises NameError — confirm intent
                if u_l_p_b >= levels_per_badges:
                    if u_l_p_b == levels_per_badges:
                        l_per_b_user = ("", "Multiple users")
                    else:
                        levels_per_badges = u_l_p_b
                        l_per_b_user = (u.user_id_queue, u.name)
                elif u_b_p_l >= badges_per_levels:
                    if u_b_p_l == badges_per_levels:
                        b_per_l_user = ("", "Multiple users")
                    else:
                        badges_per_levels = u_b_p_l
                        b_per_l_user = (u.user_id_queue, u.name)
                #bg
                if u.bg_id():
                    try:
                        bg_image[u.bg_id()] += 1
                    except KeyError:
                        bg_image[u.bg_id()] = 1
                #items: track mean/median accumulator plus the top-2 holders
                for j in range(len(self.item_names)):
                    if self.item_upload[j]:
                        if self.item_important[j]:
                            missing_something *= u.data[j + 5]
                        items[j][0].add(u.data[j + 5])
                        if u.data[j + 5] >= items[j][1][0]:
                            items[j][2] = items[j][1]
                            items[j][1] = (u.data[j + 5], u.user_id_queue, u.name)
                        elif u.data[j + 5] >= items[j][2][0]:
                            items[j][2] = (u.data[j + 5], u.user_id_queue, u.name)
                    elif u.data[j + 5] >= items[j][1][0]:
                        #non-uploaded item: keep only the bare maximum value
                        items[j][1] = [u.data[j + 5]]
                if missing_something == 0:
                    without_something += 1
    #counts
    progress.clear()
    print "\n " + str(count_updated) + " users updated"
    print " " + str(count_new) + " new users"
    midtimes = [time.clock()] #part 1->2
    progress = Progress(1, "2/3")
    user_data_file.close()
    new_data_file.close()
    if file_exists("mem/data_temp"):
        remove_file("mem/data_temp")
    if file_exists("mem/new_data"):
        remove_file("mem/new_data")
    #reset (and free memory)
    self.new_users = [] #synced
    self.new_steamid_l = [] #synced
    self.new_steamid_s = set()
    self.users_in_database = total_files
    #sort recrawled stuff early
    common_names = (sorted(common_names[0].items(), key=itemgetter(1))[-COMMON_NAME_AMOUNT - 1:],
                    sorted(common_names[1].items(), key=itemgetter(1))[-COMMON_NAME_AMOUNT - 1:],
                    sorted(common_names[2].items(), key=itemgetter(1))[-COMMON_NAME_AMOUNT:])
    progress.increment()
    bg_image = sorted(bg_image.items(), key=itemgetter(1))[-BACKGROUND_AMOUNT - 1:]
    #for recrawl
    common_names_rec = (tuple(i[0] for i in common_names[0]),
                        tuple(i[0] for i in common_names[1]),
                        tuple(i[0] for i in common_names[2]))
    #sets
    common_names_rec_all = set(i.replace(' ', '').lower() for i in common_names_rec[0] + common_names_rec[1] + common_names_rec[2])
    common_names_rec_all_long = set(i for i in common_names_rec_all if len(i) >= NAME_IN_NAME_MIN_LEN)
    bg_image_rec = set(i[0] for i in bg_image)
    #second pass over the freshly written database
    user_data_file = open("mem/data", "rb")
    midtimes.append(time.clock()) #part 2->3
    progress = Progress(total_files, "3/3")
    index = 0
    for i in user_data_file:
        #construct user and determine if analyzed
        analyze_this = False
        user_data[index] = i
        index += 1
        if index == 5:
            analyze_this = True
            u = User(data=user_data, save_dump=False)
            if user_data[0][0] == 'i':
                self.found_users.add(file_userid_to_sid(user_data[0]))
            self.found_users.add(file_steamid_to_sid(user_data[1]))
            index = 0
        #analyze now
        if analyze_this:
            #steamid
            if len(str(u.steam_id_queue)) == STEAM_ID_LEN:
                if u.steam_id_queue > steamid_max:
                    steamid_max = u.steam_id_queue
                elif u.steam_id_queue < steamid_min:
                    steamid_min = u.steam_id_queue
            #importance
            importance = 0
            if not u.private:
                for j in range(len(self.item_names)):
                    importance += importance_value(u.data[j + 5], items[j][1][0], self.item_important[j])
                if importance > self.importance_treshold:
                    imprt_correct[(u.user_id_queue, u.name, u.steam_id_queue)] = importance
            #clamp so RECRAWL_TIME / importance below never divides by zero
            importance = max(importance, MIN_IMPORTANCE)
            #full name
            name_is_common = u.name in common_names_rec[0]
            #cleaned name stuff (get common spelling)
            test_name = u.name_cleaned1()
            if test_name in common_names_rec[1]:
                clean_name_is_common = True
                clean_index = common_names_rec[1].index(test_name)
                try:
                    cleaned_names_ws[clean_index][u.name_cleaned2] += 1
                except KeyError:
                    cleaned_names_ws[clean_index][u.name_cleaned2] = 1
            else:
                clean_name_is_common = False
            #word stuff (get common spelling)
            word_is_common, word_is_common2 = False, False
            for word in u.words():
                test_word = word.lower()
                #word
                if test_word in common_names_rec[2]:
                    word_is_common = True
                    word_index = common_names_rec[2].index(test_word)
                    try:
                        words_ws[word_index][word] += 1
                    except KeyError:
                        words_ws[word_index][word] = 1
                #name-word
                if test_word in common_names_rec_all:
                    word_is_common2 = True
            #for logging purposes, nothing more:
            if name_is_common or clean_name_is_common or word_is_common:
                word_is_common2 = False
            #some common clean name inside the cleaned name:
            clean_name_is_common2 = False
            if not name_is_common and not clean_name_is_common and not word_is_common:
                for j in common_names_rec_all_long:
                    if j in test_name:
                        clean_name_is_common2 = True
            #recrawl test
            time_since_crawl = current_time - u.time
            private_hi_level = u.private and (u.steam_id_queue in self.high_leveled or queueid_to_sid(u.user_id_queue) in self.high_leveled) and HILEVEL_TIME < time_since_crawl
            is_important = RECRAWL_TIME / importance < time_since_crawl
            has_no_name = not u.name and NAMELESS_TIME < time_since_crawl
            if u.private:
                name_change, spec_recrawl, bg_change = False, False, False
            else:
                name_change = not u.data[-1] or ALIAS_TIME / ((float(u.data[-1]) / ALIAS_DAYS) ** 2) < time_since_crawl
                spec_recrawl = RECRAWL_TIME_SPEC < time_since_crawl and (u.user_id_queue == l_per_b_user[0] or u.user_id_queue == b_per_l_user[0])
                bg_change = BG_TIME < time_since_crawl and u.bg_id() in bg_image_rec
            if COMMON_TIME < time_since_crawl:
                name_recrawl = name_is_common
                clean_recrawl = clean_name_is_common
                clean_recrawl2 = clean_name_is_common2
                word_recrawl = word_is_common
                word_recrawl2 = word_is_common2
            else:
                name_recrawl, clean_recrawl, clean_recrawl2, word_recrawl, word_recrawl2 = False, False, False, False, False
            if private_hi_level or is_important or has_no_name or name_change or spec_recrawl or name_recrawl or clean_recrawl or clean_recrawl2 or word_recrawl or word_recrawl2 or bg_change:
                #crawl importants and special first
                if is_important or spec_recrawl:
                    recrawl_queue.insert(0, u.get_recrawl_name())
                else:
                    recrawl_queue.append(u.get_recrawl_name())
                if private_hi_level:
                    recrawl_hiprivate += 1
                if is_important:
                    recrawl_important += 1
                if has_no_name:
                    recrawl_noname += 1
                if name_change:
                    recrawl_alias += 1
                if spec_recrawl:
                    recrawl_special += 1
                if name_recrawl:
                    recrawl_name += 1
                if clean_recrawl:
                    recrawl_clean += 1
                if clean_recrawl2:
                    recrawl_clean2 += 1
                if word_recrawl:
                    recrawl_word += 1
                if word_recrawl2:
                    recrawl_word2 += 1
                if bg_change:
                    recrawl_bg += 1
        progress.increment()
    midtimes.append(time.clock()) #part 3->4
    user_data_file.close()
    #reset high leveled
    self.high_leveled = set()
    #steamid range
    steamidrange = (steamid_max - steamid_min + 1) / 1000000.0
    print "Steam id range: " + str(round(steamidrange, 1)) + "mil users"
    existcount = sum(int(i) for i in existlist)
    print "Approx Steam users: " + str(round(steamidrange * existcount / max(len(existlist), 1), 1)) +\
        "mil (" + str(existcount) + "/" + str(len(existlist)) + ")"
    #users without stuff
    print str(without_something) + " users without some item (" +\
        str(round(100.0 * without_something / general[0], 2)) + "%)"
    print str(without_name) + " users with no name (" +\
        str(round(100.0 * without_name / total_files, 2)) + "%)"
    #recrawl
    recrawl_queue_len = len(recrawl_queue)
    if recrawl_queue_len:
        def print_recrawl_amount(amount, desc):
            #helper: one "<amount> <desc> (<pct>%)" line
            print " " + str(amount) + " " + desc + " (" +\
                str(round(100.0 * amount / recrawl_queue_len, 2)) + "%)"
        print "recrawling " + str(recrawl_queue_len) + " users (" +\
            str(round(100.0 * recrawl_queue_len / total_files, 2)) + "%)"
        print_recrawl_amount(recrawl_hiprivate, "hi level private")
        print_recrawl_amount(recrawl_important, "important")
        print_recrawl_amount(recrawl_noname, "with no name")
        print_recrawl_amount(recrawl_alias, "for aliases")
        print_recrawl_amount(recrawl_special, "special")
        print_recrawl_amount(recrawl_name, "for name")
        print_recrawl_amount(recrawl_clean, "for clean name")
        print_recrawl_amount(recrawl_clean2, "for clean name 2")
        print_recrawl_amount(recrawl_word, "for words")
        print_recrawl_amount(recrawl_word2, "for name-words")
        print_recrawl_amount(recrawl_bg, "for background")
    #crawl some random steamids
    #these have added - character to indicate that they are random (there is one special procedure for these)
    recrawl_queue = ["-profiles/" + str(randint(steamid_min + 1, steamid_max - 1)) for i in range(RANDOM_CRAWL)] + recrawl_queue
    #sorting
    imprt_correct = sorted(imprt_correct.items(), key=itemgetter(1))
    #important list
    keys, values = handle_important_list(tuple(queue_to_file(str(imprt_correct[-i - 1][0][2])) for i in range(IMPORTANT_AMOUNT * 2)))
    keys += ["total", "l_p_b", "l_p_b_u", "l_p_b_n", "b_p_l", "b_p_l_u", "b_p_l_n"]
    values += [total_files, float(levels_per_badges), stringify(l_per_b_user[0]), stringify(redact_urls(l_per_b_user[1])), float(badges_per_levels), stringify(b_per_l_user[0]), stringify(redact_urls(b_per_l_user[1]))]
    #general
    for i in range(len(general_names)):
        keys.append(general_names[i])
        values.append(float(general[i]) / float(total_files if i < general_names_private_index else general[0]) * 100.0)
    #backgrounds
    for i in range(BACKGROUND_AMOUNT):
        keys.append("common_bg" + str(i))
        keys.append("common_bg_a" + str(i))
        keys.append("common_bg_n" + str(i))
        bg_url = bg_images[bg_image[-i - 1][0] - 1]
        game = bgurl_to_game(bg_url)
        values.append(stringify(untrim_bgurl(bg_url)))
        values.append(bg_image[-i - 1][1])
        values.append(stringify(game_names[game]) if game in game_names else '"?"')
    #common names
    for i in range(3):
        prefix = ("c_name", "s_name", "c_word")[i]
        for j in range(COMMON_NAME_AMOUNT):
            keys.append(prefix + str(j))
            keys.append(prefix + "_a" + str(j))
            #figure out cleaned name and words
            if i == 1:
                values.append(stringify(redact_urls(max(cleaned_names_ws[-j - 1].iteritems(), key=itemgetter(1))[0])))
            elif i == 2:
                values.append(stringify(redact_urls(max(words_ws[-j - 1].iteritems(), key=itemgetter(1))[0])))
            else:
                values.append(stringify(redact_urls(common_names[i][-j - 1][0])))
            values.append(int(common_names[i][-j - 1][1]))
    #items data
    for i in range(len(self.item_names)):
        if self.item_upload[i]:
            keys.append(self.item_names[i] + "_m0") # mean
            keys.append(self.item_names[i] + "_m1") # median
            keys.append(self.item_names[i] + "_m2") # mean deviation about median
            keys.append(self.item_names[i] + "_m3") # relative standard deviation
            values.append(items[i][0].get_mean())
            values.append(items[i][0].get_median())
            values.append(items[i][0].get_mean_deviation())
            values.append(items[i][0].get_relative_deviation())
            #high alias
            if self.item_names[i] == "alias":
                hi_alias = (hi_alias * (HI_ALIAS_MULT - 1) + items[i][1][0]) / HI_ALIAS_MULT
                keys.append("alias_hi")
                values.append(hi_alias)
            else:
                #not alias
                prefix = self.item_names[i] + "_hi"
                for j in range(2):
                    keys.append(prefix + str(j))
                    keys.append(prefix + "_u" + str(j))
                    keys.append(prefix + "_n" + str(j))
                    values.append(int(items[i][j + 1][0]))
                    values.append(stringify(items[i][j + 1][1]))
                    values.append(stringify(redact_urls(items[i][j + 1][2])))
    #important people
    for i in range(IMPORTANT_AMOUNT):
        keys.append("imprtnt" + str(i))
        keys.append("imprtnt_u" + str(i))
        keys.append("imprtnt_n" + str(i))
        values.append(imprt_correct[-i - 1][1] * 100.0 / self.items_important_amount)
        values.append(stringify(imprt_correct[-i - 1][0][0]))
        values.append(stringify(redact_urls(imprt_correct[-i - 1][0][1])))
    #put data into json
    data = get_json(keys, values)
    #save db memory
    imprt_len = len(imprt_correct)
    self.importance_treshold = imprt_correct[-IMPRT_TRESHOLD_INDEX][1] if imprt_len >= IMPRT_TRESHOLD_INDEX else imprt_correct[0][1]
    save_queue([total_files, self.importance_treshold], "mem/db_mem")
    self.importance_treshold *= IMPRT_TRESHOLD_MULT #lower the treshold a bit to get new treshold at 1.0
    print "important list length: " + str(imprt_len)
    print "new importance treshold: " + str(round(self.importance_treshold * 100.0 / self.items_important_amount, 2))
    request_time = max(int(time.time()), request_time + 1)
    _hash = calc_hash(request_time)
    midtimes.append(time.clock()) #part 4->5
    print "\n " + str(request_html("data dump", DATA_SAVER, get_post_values(_hash, request_time, "d", data))[-1]),
    print get_time_string(starttime, midtimes) + "\n"
    return request_time, recrawl_queue, total_crawl_age / total_files, tuple(i[1][0] for i in items), hi_alias
def synchronize(self, request_time, bg_images, game_names, hi_alias, existlist):
    """Periodic analysis pass over the whole user database.

    Merges newly crawled users into the on-disk database ("mem/data"),
    computes aggregate statistics (name frequencies, item extremes,
    backgrounds, importance), builds the recrawl queue, and uploads a JSON
    data dump. Returns (request_time, recrawl_queue, mean_crawl_age,
    per-item maxima tuple, hi_alias), or False when there is nothing to do.

    NOTE(review): this method is defined twice in this file; being the later
    definition, this copy is the effective one — confirm and remove the other.
    """
    starttime = time.clock()
    new_user_len = len(self.new_steamid_l)
    if not new_user_len:
        print "\n No new users, aborting analysis"
        return False #this should crash
    print "\n Analyzing data"
    current_time = time.time()
    #free memory (this will be updated later)
    self.found_users = set()
    steamid_max = SOME_STEAM_ID
    steamid_min = SOME_STEAM_ID
    total_files = 0
    total_crawl_age = 0.0
    general_names = ("public", "urls", "avatars", "bans", "backgrounds")
    general_names_private_index = 4
    general = [0] * len(general_names)
    bg_image = dict()
    items = [] # [ [values], (best value, best id, best name), (second value, second id, second name) ]
    for i in range(len(self.item_names)):
        items.append([countmean(), [0], [0]])
    common_names = (dict(), dict(), dict()) #full names, cleaned names, words
    cleaned_names_ws = [] #storage of most common spellings for cleaned names
    for i in range(COMMON_NAME_AMOUNT + 1):
        cleaned_names_ws.append(dict()) #should be same size as common_names_rec[1]
    words_ws = [] #storage of most common spellings for words
    for i in range(COMMON_NAME_AMOUNT):
        words_ws.append(dict()) #should be same size as common_names_rec[2]
    imprt_correct = dict() #importance container; sent to web [(id, name, steamid) = value]
    recrawl_queue = [] #used to determine who to recrawl
    #levels and badges
    levels_per_badges = 0
    badges_per_levels = 0
    #recrawl types
    recrawl_hiprivate = 0
    recrawl_important = 0
    recrawl_noname = 0
    recrawl_alias = 0
    recrawl_special = 0
    recrawl_name = 0
    recrawl_clean = 0
    recrawl_clean2 = 0
    recrawl_word = 0
    recrawl_word2 = 0
    recrawl_bg = 0
    #missing something
    without_name = 0
    without_something = 0
    #work on a snapshot copy so "mem/data" can be rewritten in place below
    if file_exists("mem/data"):
        shutil.copyfile("mem/data", "mem/data_temp")
        user_data_file = open("mem/data_temp", "rb")
    else:
        user_data_file = StringIO.StringIO("")
    new_data_file = open("mem/data", "wb")
    progress = Progress(self.users_in_database + len(self.new_steamid_l), "1/3")
    #analyze
    i = 0
    index = 0
    user_data = [0] * 5 #one database record = 5 consecutive lines
    read_new_users = False
    count_updated = 0
    count_new = 0
    while i < new_user_len:
        #construct user and determine if analyzed
        analyze_this = read_new_users
        if read_new_users: #from memory
            if self.new_steamid_l[i]:
                u = self.new_users[i]
                count_new+= 1
                progress.increment()
            else:
                analyze_this = False #already updated
            i+= 1
        else: #from database file
            line = user_data_file.readline()
            if line:
                user_data[index] = line
                index+= 1
                if index == 5: #full 5-line record assembled
                    analyze_this = True
                    u = User(data = user_data)
                    uid = file_steamid_to_sid(user_data[1])
                    if uid in self.new_steamid_s: #updated
                        uindex = self.new_steamid_l.index(uid)
                        u.update(self.new_users[uindex], self.item_names, self.item_important)
                        #mark the in-memory copy consumed so phase two skips it
                        self.new_steamid_l[uindex] = None
                        self.new_steamid_s.remove(uid)
                        count_updated+= 1
                        progress.increment()
                    index = 0
                    progress.increment()
            else:
                #database exhausted: switch to the in-memory new users
                read_new_users = True
        #analyze now
        if analyze_this:
            new_data_file.write(u.dump) #write
            total_files+= 1
            total_crawl_age+= u.time
            #data
            general[1]+= u.custom_url()
            general[2]+= u.data[1]
            general[3]+= u.data[2]
            #common names
            if u.name:
                #whole
                try:
                    common_names[0][u.name]+= 1
                except KeyError:
                    common_names[0][u.name] = 1
                #cleaned (lower case no spaces no special)
                test_name = u.name_cleaned1()
                if len(test_name) >= MIN_COMMON_NAME:
                    try:
                        common_names[1][test_name]+= 1
                    except KeyError:
                        common_names[1][test_name] = 1
                #words (lower case)
                for word in u.words():
                    if len(word) >= MIN_COMMON_WORD:
                        test_word = word.lower()
                        try:
                            common_names[2][test_word]+= 1
                        except KeyError:
                            common_names[2][test_word] = 1
            else:
                without_name+= 1
            if not u.private:
                #no items
                missing_something = 1
                #general
                general[0]+= 1
                general[4]+= u.data[3]
                #levels and badges
                u_l_p_b, u_b_p_l = u.levels_n_badges(self.level_index, self.badge_index)
                #NOTE(review): the elif couples the two record-holders — a user
                #can only set badges_per_levels when not beating levels_per_badges;
                #if no user ever reaches the elif, b_per_l_user stays undefined
                #and the values list below raises NameError — confirm intent
                if u_l_p_b >= levels_per_badges:
                    if u_l_p_b == levels_per_badges:
                        l_per_b_user = ("", "Multiple users")
                    else:
                        levels_per_badges = u_l_p_b
                        l_per_b_user = (u.user_id_queue, u.name)
                elif u_b_p_l >= badges_per_levels:
                    if u_b_p_l == badges_per_levels:
                        b_per_l_user = ("", "Multiple users")
                    else:
                        badges_per_levels = u_b_p_l
                        b_per_l_user = (u.user_id_queue, u.name)
                #bg
                if u.bg_id():
                    try:
                        bg_image[u.bg_id()]+= 1
                    except KeyError:
                        bg_image[u.bg_id()] = 1
                #items: track mean/median accumulator plus the top-2 holders
                for j in range(len(self.item_names)):
                    if self.item_upload[j]:
                        if self.item_important[j]:
                            missing_something*= u.data[j + 5]
                        items[j][0].add(u.data[j + 5])
                        if u.data[j + 5] >= items[j][1][0]:
                            items[j][2] = items[j][1]
                            items[j][1] = (u.data[j + 5], u.user_id_queue, u.name)
                        elif u.data[j + 5] >= items[j][2][0]:
                            items[j][2] = (u.data[j + 5], u.user_id_queue, u.name)
                    elif u.data[j + 5] >= items[j][1][0]:
                        #non-uploaded item: keep only the bare maximum value
                        items[j][1] = [u.data[j + 5]]
                if missing_something == 0:
                    without_something+= 1
    #counts
    progress.clear()
    print "\n " + str(count_updated) + " users updated"
    print " " + str(count_new) + " new users"
    midtimes = [time.clock()] #part 1->2
    progress = Progress(1, "2/3")
    user_data_file.close()
    new_data_file.close()
    if file_exists("mem/data_temp"):
        remove_file("mem/data_temp")
    if file_exists("mem/new_data"):
        remove_file("mem/new_data")
    #reset (and free memory)
    self.new_users = [] #synced
    self.new_steamid_l = [] #synced
    self.new_steamid_s = set()
    self.users_in_database = total_files
    #sort recrawled stuff early
    common_names = (sorted(common_names[0].items(), key = itemgetter(1))[-COMMON_NAME_AMOUNT-1:],
                    sorted(common_names[1].items(), key = itemgetter(1))[-COMMON_NAME_AMOUNT-1:],
                    sorted(common_names[2].items(), key = itemgetter(1))[-COMMON_NAME_AMOUNT:])
    progress.increment()
    bg_image = sorted(bg_image.items(), key = itemgetter(1))[-BACKGROUND_AMOUNT-1:]
    #for recrawl
    common_names_rec = (tuple(i[0] for i in common_names[0]),
                        tuple(i[0] for i in common_names[1]),
                        tuple(i[0] for i in common_names[2]))
    #sets
    common_names_rec_all = set(i.replace(' ', '').lower() for i in common_names_rec[0] + common_names_rec[1] + common_names_rec[2])
    common_names_rec_all_long = set(i for i in common_names_rec_all if len(i) >= NAME_IN_NAME_MIN_LEN)
    bg_image_rec = set(i[0] for i in bg_image)
    #second pass over the freshly written database
    user_data_file = open("mem/data", "rb")
    midtimes.append(time.clock()) #part 2->3
    progress = Progress(total_files, "3/3")
    index = 0
    for i in user_data_file:
        #construct user and determine if analyzed
        analyze_this = False
        user_data[index] = i
        index+= 1
        if index == 5:
            analyze_this = True
            u = User(data = user_data, save_dump = False)
            if user_data[0][0] == 'i':
                self.found_users.add(file_userid_to_sid(user_data[0]))
            self.found_users.add(file_steamid_to_sid(user_data[1]))
            index = 0
        #analyze now
        if analyze_this:
            #steamid
            if len(str(u.steam_id_queue)) == STEAM_ID_LEN:
                if u.steam_id_queue > steamid_max:
                    steamid_max = u.steam_id_queue
                elif u.steam_id_queue < steamid_min:
                    steamid_min = u.steam_id_queue
            #importance
            importance = 0
            if not u.private:
                for j in range(len(self.item_names)):
                    importance+= importance_value(u.data[j + 5], items[j][1][0], self.item_important[j])
                if importance > self.importance_treshold:
                    imprt_correct[(u.user_id_queue, u.name, u.steam_id_queue)] = importance
            #clamp so RECRAWL_TIME / importance below never divides by zero
            importance = max(importance, MIN_IMPORTANCE)
            #full name
            name_is_common = u.name in common_names_rec[0]
            #cleaned name stuff (get common spelling)
            test_name = u.name_cleaned1()
            if test_name in common_names_rec[1]:
                clean_name_is_common = True
                clean_index = common_names_rec[1].index(test_name)
                try:
                    cleaned_names_ws[clean_index][u.name_cleaned2]+= 1
                except KeyError:
                    cleaned_names_ws[clean_index][u.name_cleaned2] = 1
            else:
                clean_name_is_common = False
            #word stuff (get common spelling)
            word_is_common, word_is_common2 = False, False
            for word in u.words():
                test_word = word.lower()
                #word
                if test_word in common_names_rec[2]:
                    word_is_common = True
                    word_index = common_names_rec[2].index(test_word)
                    try:
                        words_ws[word_index][word]+= 1
                    except KeyError:
                        words_ws[word_index][word] = 1
                #name-word
                if test_word in common_names_rec_all:
                    word_is_common2 = True
            #for logging purposes, nothing more:
            if name_is_common or clean_name_is_common or word_is_common:
                word_is_common2 = False
            #some common clean name inside the cleaned name:
            clean_name_is_common2 = False
            if not name_is_common and not clean_name_is_common and not word_is_common:
                for j in common_names_rec_all_long:
                    if j in test_name:
                        clean_name_is_common2 = True
            #recrawl test
            time_since_crawl = current_time - u.time
            private_hi_level = u.private and (u.steam_id_queue in self.high_leveled or queueid_to_sid(u.user_id_queue) in self.high_leveled) and HILEVEL_TIME < time_since_crawl
            is_important = RECRAWL_TIME / importance < time_since_crawl
            has_no_name = not u.name and NAMELESS_TIME < time_since_crawl
            if u.private:
                name_change, spec_recrawl, bg_change = False, False, False
            else:
                name_change = not u.data[-1] or ALIAS_TIME / ((float(u.data[-1]) / ALIAS_DAYS) ** 2) < time_since_crawl
                spec_recrawl = RECRAWL_TIME_SPEC < time_since_crawl and (u.user_id_queue == l_per_b_user[0] or u.user_id_queue == b_per_l_user[0])
                bg_change = BG_TIME < time_since_crawl and u.bg_id() in bg_image_rec
            if COMMON_TIME < time_since_crawl:
                name_recrawl = name_is_common
                clean_recrawl = clean_name_is_common
                clean_recrawl2 = clean_name_is_common2
                word_recrawl = word_is_common
                word_recrawl2 = word_is_common2
            else:
                name_recrawl, clean_recrawl, clean_recrawl2, word_recrawl, word_recrawl2 = False, False, False, False, False
            if private_hi_level or is_important or has_no_name or name_change or spec_recrawl or name_recrawl or clean_recrawl or clean_recrawl2 or word_recrawl or word_recrawl2 or bg_change:
                #crawl importants and special first
                if is_important or spec_recrawl:
                    recrawl_queue.insert(0, u.get_recrawl_name())
                else:
                    recrawl_queue.append(u.get_recrawl_name())
                if private_hi_level:
                    recrawl_hiprivate+= 1
                if is_important:
                    recrawl_important+= 1
                if has_no_name:
                    recrawl_noname+= 1
                if name_change:
                    recrawl_alias+= 1
                if spec_recrawl:
                    recrawl_special+= 1
                if name_recrawl:
                    recrawl_name+= 1
                if clean_recrawl:
                    recrawl_clean+= 1
                if clean_recrawl2:
                    recrawl_clean2+= 1
                if word_recrawl:
                    recrawl_word+= 1
                if word_recrawl2:
                    recrawl_word2+= 1
                if bg_change:
                    recrawl_bg+= 1
        progress.increment()
    midtimes.append(time.clock()) #part 3->4
    user_data_file.close()
    #reset high leveled
    self.high_leveled = set()
    #steamid range
    steamidrange = (steamid_max - steamid_min + 1) / 1000000.0
    print "Steam id range: " + str(round(steamidrange, 1)) + "mil users"
    existcount = sum(int(i) for i in existlist)
    print "Approx Steam users: " + str(round(steamidrange * existcount / max(len(existlist), 1), 1)) +\
        "mil (" + str(existcount) + "/" + str(len(existlist)) + ")"
    #users without stuff
    print str(without_something) + " users without some item (" +\
        str(round(100.0 * without_something / general[0], 2)) + "%)"
    print str(without_name) + " users with no name (" +\
        str(round(100.0 * without_name / total_files, 2)) + "%)"
    #recrawl
    recrawl_queue_len = len(recrawl_queue)
    if recrawl_queue_len:
        def print_recrawl_amount(amount, desc):
            #helper: one "<amount> <desc> (<pct>%)" line
            print " " + str(amount) + " " + desc + " (" +\
                str(round(100.0 * amount / recrawl_queue_len, 2)) + "%)"
        print "recrawling " + str(recrawl_queue_len) + " users (" +\
            str(round(100.0 * recrawl_queue_len / total_files, 2)) + "%)"
        print_recrawl_amount(recrawl_hiprivate, "hi level private")
        print_recrawl_amount(recrawl_important, "important")
        print_recrawl_amount(recrawl_noname, "with no name")
        print_recrawl_amount(recrawl_alias, "for aliases")
        print_recrawl_amount(recrawl_special, "special")
        print_recrawl_amount(recrawl_name, "for name")
        print_recrawl_amount(recrawl_clean, "for clean name")
        print_recrawl_amount(recrawl_clean2, "for clean name 2")
        print_recrawl_amount(recrawl_word, "for words")
        print_recrawl_amount(recrawl_word2, "for name-words")
        print_recrawl_amount(recrawl_bg, "for background")
    #crawl some random steamids
    #these have added - character to indicate that they are random (there is one special procedure for these)
    recrawl_queue = ["-profiles/" + str(randint(steamid_min + 1, steamid_max - 1)) for i in range(RANDOM_CRAWL)] + recrawl_queue
    #sorting
    imprt_correct = sorted(imprt_correct.items(), key = itemgetter(1))
    #important list
    keys, values = handle_important_list(tuple(queue_to_file(str(imprt_correct[-i - 1][0][2])) for i in range(IMPORTANT_AMOUNT * 2)))
    keys+= ["total", "l_p_b", "l_p_b_u", "l_p_b_n", "b_p_l", "b_p_l_u", "b_p_l_n"]
    values+= [total_files, float(levels_per_badges), stringify(l_per_b_user[0]), stringify(redact_urls(l_per_b_user[1])), float(badges_per_levels), stringify(b_per_l_user[0]), stringify(redact_urls(b_per_l_user[1]))]
    #general
    for i in range(len(general_names)):
        keys.append(general_names[i])
        values.append(float(general[i]) / float(total_files if i < general_names_private_index else general[0]) * 100.0)
    #backgrounds
    for i in range(BACKGROUND_AMOUNT):
        keys.append("common_bg" + str(i))
        keys.append("common_bg_a" + str(i))
        keys.append("common_bg_n" + str(i))
        bg_url = bg_images[bg_image[-i - 1][0] - 1]
        game = bgurl_to_game(bg_url)
        values.append(stringify(untrim_bgurl(bg_url)))
        values.append(bg_image[-i - 1][1])
        values.append(stringify(game_names[game]) if game in game_names else '"?"')
    #common names
    for i in range(3):
        prefix = ("c_name", "s_name", "c_word")[i]
        for j in range(COMMON_NAME_AMOUNT):
            keys.append(prefix + str(j))
            keys.append(prefix + "_a" + str(j))
            #figure out cleaned name and words
            if i == 1:
                values.append(stringify(redact_urls(max(cleaned_names_ws[-j - 1].iteritems(), key = itemgetter(1))[0])))
            elif i == 2:
                values.append(stringify(redact_urls(max(words_ws[-j - 1].iteritems(), key = itemgetter(1))[0])))
            else:
                values.append(stringify(redact_urls(common_names[i][-j - 1][0])))
            values.append(int(common_names[i][-j - 1][1]))
    #items data
    for i in range(len(self.item_names)):
        if self.item_upload[i]:
            keys.append(self.item_names[i] + "_m0") # mean
            keys.append(self.item_names[i] + "_m1") # median
            keys.append(self.item_names[i] + "_m2") # mean deviation about median
            keys.append(self.item_names[i] + "_m3") # relative standard deviation
            values.append(items[i][0].get_mean())
            values.append(items[i][0].get_median())
            values.append(items[i][0].get_mean_deviation())
            values.append(items[i][0].get_relative_deviation())
            #high alias
            if self.item_names[i] == "alias":
                hi_alias = (hi_alias * (HI_ALIAS_MULT - 1) + items[i][1][0]) / HI_ALIAS_MULT
                keys.append("alias_hi")
                values.append(hi_alias)
            else:
                #not alias
                prefix = self.item_names[i] + "_hi"
                for j in range(2):
                    keys.append(prefix + str(j))
                    keys.append(prefix + "_u" + str(j))
                    keys.append(prefix + "_n" + str(j))
                    values.append(int(items[i][j + 1][0]))
                    values.append(stringify(items[i][j + 1][1]))
                    values.append(stringify(redact_urls(items[i][j + 1][2])))
    #important people
    for i in range(IMPORTANT_AMOUNT):
        keys.append("imprtnt" + str(i))
        keys.append("imprtnt_u" + str(i))
        keys.append("imprtnt_n" + str(i))
        values.append(imprt_correct[-i - 1][1] * 100.0 / self.items_important_amount)
        values.append(stringify(imprt_correct[-i - 1][0][0]))
        values.append(stringify(redact_urls(imprt_correct[-i - 1][0][1])))
    #put data into json
    data = get_json(keys, values)
    #save db memory
    imprt_len = len(imprt_correct)
    self.importance_treshold = imprt_correct[-IMPRT_TRESHOLD_INDEX][1] if imprt_len >= IMPRT_TRESHOLD_INDEX else imprt_correct[0][1]
    save_queue([total_files, self.importance_treshold], "mem/db_mem")
    self.importance_treshold*= IMPRT_TRESHOLD_MULT #lower the treshold a bit to get new treshold at 1.0
    print "important list length: " + str(imprt_len)
    print "new importance treshold: " + str(round(self.importance_treshold * 100.0 / self.items_important_amount, 2))
    request_time = max(int(time.time()), request_time + 1)
    _hash = calc_hash(request_time)
    midtimes.append(time.clock()) #part 4->5
    print "\n " + str(request_html("data dump", DATA_SAVER, get_post_values(_hash, request_time, "d", data))[-1]),
    print get_time_string(starttime, midtimes) + "\n"
    return request_time, recrawl_queue, total_crawl_age / total_files, tuple(i[1][0] for i in items), hi_alias