Пример #1
0
    def __init__(self):
        starttime = time.clock()
        super(crawl, self).__init__()
        print "    Welcome to the crawler version " + VERSION + "\n"
        for i in USER_AGENT.split():
            print i + "\n ",
        print "\nInitializing"

        self.quit = True
        self.quit_analyze = True
        self.request_time = 0

        #alias should be the last one
        #if not, recrawl will no longer work for alias
        self.item_names = ("level",
                           "badge",
                           "game",
                           "screenshot",
                           "video",
                           "workshop",
                           "recommendation",
                           "guide",
                           "image",
                           "greenlight",
                           "item",
                           "group",
                           "friend",
                           "alias",
                           )
        self.item_search = (r"badges/",
                            r"games/\?tab=all",
                            r"screenshots/",
                            r"videos/",
                            r"myworkshopfiles/",
                            r"recommended/",
                            r"myworkshopfiles/\?section=guides",
                            r"images/",
                            r"myworkshopfiles/\?section=greenlight",
                            r"inventory/",
                            r"groups/",
                            r"friends/",
                            )
        self.item_important = (True,  #level
                               False, #badge
                               True,  #game
                               False, #screenshot
                               False, #video
                               False, #workshop
                               False, #recommendation
                               False, #guide
                               False, #image
                               False, #greenlight
                               False, #item
                               True,  #group
                               True,  #friend
                               False, #alias
                               )
        self.item_upload = list(self.item_important)
        self.item_upload[-1] = True #alias
        self.item_upload = tuple(self.item_upload)

        if not file_exists("mem/stats"): print "\nRUNNING THE CRAWLER FOR THE FIRST TIME\n"
        # [start time, crawls, bytes, crawl age, uptime, hi alias]
        self.alltimestats = load_queue("mem/stats", [time.time(), 0.0, 0.0, time.time(), 0.0, 0.0])
        self.queue = load_queue("mem/queue", [FIRST_USER], file_to_queue)
        self.hiscores = load_queue("mem/high", [1] * len(self.item_names), int)
        self.save_times = load_queue("mem/times", [])
        self.save_amounts = load_queue("mem/bytes", [])
        self.bg_images = load_queue("mem/backgrounds", [], file_to_bgurl)
        self.uptime = self.alltimestats[4]
        if file_exists("mem/exists"):
            with open("mem/exists", "rb") as f: self.existlist = f.read()
        else: self.existlist = ""

        self.games = load_dict("mem/games", {})
        self.games_queue = []
        for i in self.bg_images:
            game = bgurl_to_game(i)
            if game not in self.games and game not in self.games_queue: self.games_queue.append(game)

        #regexes and search strings
            #public
        self.re_name = re.compile(r'"personaname":"([^"]*)') # 1
        self.re_steamid = re.compile(r'"steamid":"([^"]*)') # 1
        self.re_customurl = re.compile(r'"url":"([^"]*)') # 1
        self.se_private = "private_profile"
        self.se_noavatar = "fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_full.jpg"
        self.se_bans = "profile_ban"
            #private
        self.se_background = "has_profile_background"
        self.re_bgimage = re.compile(r"background-image: url\(( ')?([^')]*)") # 2
        self.re_friends      = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)') # 1
        self.re_friend_level = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)[\D]*([\d]*)') # 1 + 3
        self.re_level = re.compile(r'"friendPlayerLevelNum">(\d*)') # 1
            #positions
        self.se_comments = "profile_comment_area"
        self.se_leftcol = "profile_leftcol"
        self.se_rightcol = "profile_rightcol"
        self.se_topfriends = "profile_topfriends"
            #game
        self.re_game = re.compile(r'apphub_AppName[^>]*>([^<]*)') # 1

        #performance stats
        self.crawl_times_sum = 0
        self.crawl_times_amount = 0

        self.request_handler = Request_handler(self.queue)

        self.database = Database(self.item_names, self.item_important, self.item_upload)

        self.next_backup = get_next_backup_time()

        print "  " + str(len(self.queue)) + " users in queue"
        print "  " + str(len(self.bg_images)) + " backgrounds found"
        print "  " + str(len(self.games)) + " games crawled"
        print "  " + str(round(self.uptime / 86400.0, 1)) + " days of crawling time"
        print "Next backup in " + str(round((self.next_backup - time.time()) / 3600.0, 1)) + " hours"
        print "Done initializing",
        print get_time_string(starttime)