def __init__(self, params=None, page=None, rankRange=None):
    """Pick the start URL, launch Chrome and initialise paging state.

    Args:
        params: Crawl mode. ``'trending'``, ``'interrupt'`` and ``'page'``
            each select a fixed start URL; any other value selects the
            DIAMOND_THROUGH_LEGEND deck listing.
        page: Accepted for interface compatibility; not read here.
        rankRange: Stored unchanged on ``self.rankRange``.
    """
    super(HSDecksSpider, self).__init__()

    # (mode, start URL) pairs. Further filters such as playerClasses,
    # archetypes, timeRange, includedSet or minGames can be added to the
    # URL fragment when a narrower crawl is wanted.
    mode_urls = (
        ('trending', 'https://hsreplay.net/decks/trending/'),
        ('interrupt', 'https://hsreplay.net/decks/#page=48'),
        ('page', 'https://hsreplay.net/decks/#playerClasses=DEMONHUNTER'),
    )
    start_url = 'https://hsreplay.net/decks/#rankRange=DIAMOND_THROUGH_LEGEND'
    for mode, url in mode_urls:
        if params == mode:
            start_url = url
            break
    self.start_urls = [start_url]

    # Images are disabled on every platform; headless mode only on Linux.
    flags = [
        '--disable-gpu',
        '--no-sandbox',
        'blink-settings=imagesEnabled=false',
    ]
    if platform.platform().find('Linux') != -1:
        flags.append('--headless')
    driver_options = webdriver.ChromeOptions()
    for flag in flags:
        driver_options.add_argument(flag)
    # NOTE(review): ``chrome_options`` is the legacy keyword; newer Selenium
    # releases expect ``options`` -- confirm against the pinned version.
    self.browser = webdriver.Chrome(chrome_options=driver_options)

    # Scrapy signals; per the original note the spider_closed handler is
    # what closes the browser when the spider exits.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)

    self.ifanr = iFanr()

    # Original note: data for pages 48 to 80 is skipped in 'interrupt' mode.
    resuming = params == 'interrupt'
    self.interrupt_page = 80 if resuming else 100
    # Original note: around page 70 Chrome has to be closed and reopened.
    self.current_page = self.interrupt_page + 1 if resuming else 1

    self.params = params
    self.rankRange = rankRange
    self.total_page = 0
    self.langToggleClicked = False
    self.addCookieFlag = True
def __init__(self):
    """Launch the Chrome driver and reset the paging/cookie state."""
    super(HSDecksSpider, self).__init__()

    # Images are disabled on every platform; headless mode only on Linux.
    flags = [
        '--disable-gpu',
        '--no-sandbox',
        'blink-settings=imagesEnabled=false',
    ]
    if platform.platform().find('Linux') != -1:
        flags.append('--headless')
    driver_options = webdriver.ChromeOptions()
    for flag in flags:
        driver_options.add_argument(flag)
    self.browser = webdriver.Chrome(chrome_options=driver_options)

    # Scrapy signal; per the original note the handler closes the browser
    # when the spider exits.
    dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.ifanr = iFanr()
    self.current_page = 1
    self.total_page = 0
    self.langToggleClicked = False
    self.addCookieFlag = True
def __init__(self, params=None, card_hsid=None, local_update=False):
    """Initialise counters and, unless running a local update, start Chrome.

    Args:
        params: ``'extra_data'`` turns on ``self.extra_data_flag``.
        card_hsid: Stored on ``self.single_card``.
        local_update: Bool, or its textual form (e.g. ``'True'``/``'False'``
            as passed on a command line). When truthy no browser is started.

    Raises:
        ValueError, SyntaxError: If ``local_update`` is a string that is
            not a Python literal.
    """
    import ast  # local import: only needed to parse the textual flag

    super(HSArenaCardsSpider, self).__init__()

    # The previous ``eval(local_update)`` raised TypeError for the default
    # ``False`` (eval only accepts strings/code) and would execute arbitrary
    # expressions; parse literals only and let real booleans pass through.
    if isinstance(local_update, str):
        text = local_update.strip()
        if text.lower() in ('true', 'false'):
            local_update = text.lower() == 'true'
        else:
            local_update = ast.literal_eval(text)
    self.local_update = local_update

    if not self.local_update:
        chrome_opt = webdriver.ChromeOptions()
        chrome_opt.add_argument('--disable-gpu')
        chrome_opt.add_argument('--no-sandbox')
        if platform.platform().find('Linux') != -1:
            # No images and no visible window on Linux hosts.
            chrome_opt.add_argument('blink-settings=imagesEnabled=false')
            chrome_opt.add_argument('--headless')
        self.browser = webdriver.Chrome(chrome_options=chrome_opt)
        # Scrapy signal; per the original note the handler closes the
        # browser when the spider exits.
        # NOTE(review): the original indentation was lost, so it is unclear
        # whether this connect belongs inside the branch -- confirm.
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    self.ifanr = iFanr()
    self.total_count = 0
    self.scraped_count = 0
    self.temp_count = 0
    self.cards_series = {}
    self.extra_data_flag = params == 'extra_data'
    self.single_card = card_hsid
    self.addCookieFlag = True
def update_new_cards_schedule():
    """Upload card reveal entries parsed from ``new_cards.html``.

    Every ``div.card_revealed_item`` in the file yields one row for the
    ``new_cards`` table, holding the cover image URL and the reveal time as
    a Unix timestamp. Items without a reveal time are reported and skipped.

    Raises:
        ValueError: If a non-empty reveal time does not match
            ``%Y-%m-%d %H:%M``.
    """
    file = 'new_cards.html'
    with open(file, 'r', encoding='UTF-8') as f:
        text = f.read()

    t_selector = Selector(text=text)
    items = t_selector.css('div.card_revealed_item')
    ifanr = iFanr()
    tableID = ifanr.tablesID['new_cards']
    time_format = '%Y-%m-%d %H:%M'

    for item in items:
        cover = item.css(
            'div.card_revealed_img img::attr(src)').extract_first('')
        u_time = item.css('div.card_revealed_time::text').extract_first(
            '').strip()
        if not u_time:
            # strptime would raise on '' and abort a half-uploaded batch.
            print('skip item without reveal time:', cover)
            continue

        # Parse once; mktime interprets the value in the machine's local zone.
        reveal_dt = datetime.datetime.strptime(u_time, time_format)
        timestamp = int(time.mktime(reveal_dt.timetuple()))
        # Printed for inspection only, not uploaded.
        # NOTE(review): the 8-hour shift presumably assumes the page lists
        # UTC+8 times -- confirm.
        utc_reveal_time = (
            reveal_dt - datetime.timedelta(hours=8)).isoformat()
        # Previously printed the ``time`` module instead of the timestamp.
        print(cover, timestamp, utc_reveal_time)

        data = {'cover': cover, 'reveal_time': timestamp}
        res = ifanr.add_table_data(tableID=tableID, data=data)
        print(res)
def __init__(self, faction=None):
    """Launch the Chrome driver, hook Scrapy signals and store the filter.

    Args:
        faction: Optional string holding a Python expression; when truthy
            its evaluated value is stored on ``self.faction``, else ``None``.
    """
    super(BestdeckSpider, self).__init__()

    chrome_opt = webdriver.ChromeOptions()
    # Images are disabled on every platform; headless mode only on Linux.
    for flag in ('--disable-gpu', '--no-sandbox',
                 'blink-settings=imagesEnabled=false'):
        chrome_opt.add_argument(flag)
    if platform.platform().find('Linux') != -1:
        chrome_opt.add_argument('--headless')
    self.browser = webdriver.Chrome(chrome_options=chrome_opt)

    # Scrapy signals; per the original note the spider_closed handler is
    # what closes the browser when the spider exits.
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    dispatcher.connect(self.engine_stopped, signals.engine_stopped)

    self.ifanr = iFanr()
    self.langToggleClicked = False
    self.addCookieFlag = True

    # NOTE(review): eval() executes whatever expression the caller passes;
    # if only literals are expected, ast.literal_eval would be safer.
    if faction:
        self.faction = eval(faction)
    else:
        self.faction = None
print('update', re_dict) else: print('请传入需要筛选卡牌的dbfid') pass if res.get('meta').get('next'): page += 1 filter_decks(ifanr, query, query_card=query_card, limit=20, page=page, offset=limit * page) pass if __name__ == '__main__': ifanr = iFanr() dt = '2020-04-10 00:00:00' ts = int(time.mktime(time.strptime(dt, "%Y-%m-%d %H:%M:%S"))) query_card = [56394] query = { 'where': json.dumps({ # 'last_30_days': {'$eq': False}, # 'updated_at': {'$gt': ts}, 'card_array': { '$in': query_card } }), } filter_decks(ifanr, query, query_card)
def update_new_cards(card_list):
    """Insert or update card rows in the ``new_cards`` table.

    Each raw card dict is converted to the table's row format, then looked
    up by ``dbfId``: an existing row is updated, otherwise a new row is
    added.

    Args:
        card_list: Raw card dicts. When falsy, the cards are read from the
            ``cards`` key of ``new_cards.json`` in the working directory.
    """
    ifanr = iFanr()
    tableID = ifanr.tablesID['new_cards']
    fallback_path = 'new_cards.json'

    # Numeric id -> stored name, so every lookup is a single dict access.
    class_by_id = {
        2: 'Druid',
        3: 'Hunter',
        4: 'Mage',
        5: 'Paladin',
        6: 'Priest',
        7: 'Rogue',
        8: 'Shaman',
        9: 'Warlock',
        10: 'Warrior',
        12: 'Neutral',
        14: 'DemonHunter',
    }
    type_by_id = {
        4: 'MINION',
        5: 'SPELL',
        3: 'HERO',
        10: 'HERO_POWER',
        7: 'WEAPON',
    }
    rarity_by_id = {
        2: 'free',
        1: 'common',
        3: 'rare',
        4: 'epic',
        5: 'legendary',
    }
    race_by_id = {
        26: 'ALL',
        24: 'DRAGON',
        15: 'DEMON',
        23: 'PIRATE',
        20: 'BEAST',
        21: 'TOTEM',
        14: 'MURLOC',
        18: 'ELEMENTAL',
        17: 'MECHANICAL',
    }
    # cardSetId -> table set_id; any other card set falls back to 28.
    set_id_by_card_set = {1414: 23, 2: 1, 1463: 24}

    def lookup(table, key, label):
        """Return the name for ``key``; '' when missing or unknown."""
        if not key:
            return ''
        name = table.get(key)
        if name is None:
            # An unknown id used to abort the whole import with IndexError.
            print('unknown %s id: %s' % (label, key))
            return ''
        return name

    def format_data(cards):
        """Convert each raw card and upsert it into the table."""
        for item in cards:
            if item.get('classId') == 1:
                # NOTE(review): this stops processing ALL remaining cards,
                # not just this one -- confirm ``break`` (vs ``continue``).
                print('aaa', item)
                break

            card_class = lookup(class_by_id, item.get('classId'), 'class')
            multi_ids = item.get('multiClassIds')
            if multi_ids:
                resolved = (lookup(class_by_id, class_id, 'class')
                            for class_id in multi_ids)
                multi_class = [name for name in resolved if name]
            else:
                multi_class = [card_class] if card_class else []

            data = {
                'name': item.get('name'),
                'dbfId': item.get('id'),
                'cost': item.get('manaCost'),
                'health': item.get('health'),
                'attack': item.get('attack'),
                'text': item.get('text'),
                'img_card_link': item.get('image'),
                'flavor': item.get('flavorText'),
                'entourage': item.get('childIds'),
                'cardClass': card_class,
                'multiClass': multi_class,
                'collectible': item.get('collectible'),
                'artist': item.get('artistName'),
                'type': lookup(type_by_id, item.get('cardTypeId'), 'type'),
                'rarity': lookup(rarity_by_id, item.get('rarityId'),
                                 'rarity'),
                'race': lookup(race_by_id, item.get('minionTypeId'), 'race'),
                'set_id': set_id_by_card_set.get(item.get('cardSetId'), 28),
                'invalid': 0,
            }

            query = {
                'where': json.dumps({'dbfId': {
                    '$eq': item['id']
                }}),
            }
            res = ifanr.get_table_data(tableID=tableID, query=query)
            if not res:
                print('res is none')
                continue

            if res.get('meta').get('total_count'):
                objects = res.get('objects')
                if not objects:
                    # The old code stored this message in ``card`` and then
                    # indexed it with ['id'], raising TypeError.
                    print('not found card:%s' % item['id'])
                    continue
                ifanr.put_table_data(tableID=tableID,
                                     id=objects[0]['id'],
                                     data=data)
                print('update', res)
            else:
                res = ifanr.add_table_data(tableID=tableID, data=data)
                print('add', res)

    if card_list:
        format_data(card_list)
    else:
        with open(fallback_path, 'r', encoding='UTF-8') as f:
            payload = json.load(f)
        format_data(payload['cards'])