async def scrape(cls):
    # inits
    CACHE = utils.load_json_with_default(utils.SUPER_CACHE_FILE,
                                         default=cls.DEFAULT_CACHE)

    async with get_session() as session:
        # check for new auctions
        home_html = await get_html(cls.HOME_BASE_LINK, session)
        home_soup = BeautifulSoup(home_html, 'html.parser')

        rows = home_soup.find("tbody").find_all("tr")
        auc_names = [
            r.find("a", href=lambda x: x and "itemlist" in x)['href']
            for r in rows
        ]
        auc_nums = [r.find("td").get_text().zfill(3) for r in rows]
        auc_dates = [r.find_all("td")[1].get_text() for r in rows]
        auc_dates = [cls._to_epoch(x) for x in auc_dates]
        assert len(auc_names) == len(auc_nums) == len(auc_dates)

        # get uncached pages
        new_aucs = []
        for i in range(len(rows)):
            if auc_names[i] not in CACHE['seen']:
                new_aucs.append((auc_nums[i], auc_names[i], auc_dates[i]))

        # create folder for auction page html
        if not os.path.exists(utils.SUPER_HTML_DIR):
            os.makedirs(utils.SUPER_HTML_DIR)

        # pull uncached pages
        for num, name, date in new_aucs:
            out_path = utils.SUPER_HTML_DIR + name + ".html"
            if not os.path.exists(out_path):
                await asyncio.sleep(cls.SCRAPE_DELAY)
                auc_html = await get_html(cls.HOME_BASE_LINK + name, session)

                if "Auction ended" not in auc_html:
                    continue  # ignore ongoing

                with open(out_path, "w", encoding='utf-8') as f:
                    f.write(auc_html)

            tmp = name.replace("itemlist", "")
            CACHE['seen'].append(name)
            CACHE['num_map'][tmp] = num
            CACHE['time_map'][tmp] = date

        # update cache
        if new_aucs:
            CACHE['seen'].sort(reverse=True)
            utils.dump_json(CACHE, utils.SUPER_CACHE_FILE)

        # true if new auctions found
        return bool(new_aucs)
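# `cls._to_epoch` is used above but not shown. A minimal sketch, assuming the
# auction list renders its dates as "YYYY-MM-DD" (the actual display format is
# an assumption):
def _to_epoch(cls, date_string):
    from datetime import datetime, timezone
    # parse the displayed date and convert it to a Unix timestamp (UTC)
    dt = datetime.strptime(date_string.strip(), "%Y-%m-%d")
    return int(dt.replace(tzinfo=timezone.utc).timestamp())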
async def scrape_ranges(cls):
    async with get_session() as session:
        html = await get_html(cls.DATA_LINK, session)

        soup = BeautifulSoup(html, 'html.parser')
        json_string = soup.find(
            lambda x: 'data-itemranges' in x.attrs)['data-itemranges']

        data = json.loads(json_string)
        utils.dump_json(data, utils.RANGES_FILE)
        return data
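# Hypothetical usage of scrape_ranges from synchronous code (the SuperScraper
# class name is made up; the real class owning these methods isn't shown):
def refresh_ranges():
    import asyncio
    return asyncio.run(SuperScraper.scrape_ranges())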
async def get_updates(self):
    session = get_session()
    try:
        updates = self.parse_update_page(session)
        async for x in self.filter_updates(updates, session):
            yield self.format_update(x)
    except Exception as e:
        print(e, file=sys.stderr)
    finally:
        # close the session even if the generator is cancelled mid-iteration
        await session.close()
async def get_updates(cls):
    session = get_session()
    try:
        lst = await cls.fetch_updates(session)
        lst = cls.filter_updates(lst)

        for x in lst:
            x['cover_link'] = await cls.get_cover_link(x, session)
            yield cls.format_update(x)
            await asyncio.sleep(3)  # rate-limit the cover fetches
    finally:
        # runs even if the consumer abandons the generator early
        await session.close()
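# Hypothetical consumer of the get_updates generators above (UpdateScraper
# stands in for whichever class owns them):
async def print_updates():
    async for update in UpdateScraper.get_updates():
        print(update)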
async def searchLottoItems(item):
    import utils
    from utils.scraper_utils import get_session

    lst = []
    split = item.lower().split()  # lowercase to match ent["eq"].lower() below

    data = utils.load_json_with_default(utils.DATA_DIR + "lotto_data.json",
                                        default={"w": {}, "a": {}})

    # latest known weapon / armor lottery numbers
    numW = max([int(x) for x in data["w"].keys()]) if data['w'] else 0
    numA = max([int(x) for x in data["a"].keys()]) if data['a'] else 0

    # re-scrape if the cached data is stale
    if weaponLotteryOutdated(numW) or armorLotteryOutdated(numA):
        print("OUTDATED", numW, numA)
        await lotto_dl(get_session())
        await lotto_parse()
        data = utils.load_json_with_default(utils.DATA_DIR + "lotto_data.json",
                                            default={"w": {}, "a": {}})

    for kind in data:
        ks = list(data[kind].keys())
        ks.sort(key=lambda x: -int(x))  # newest lottery first

        for num in ks:
            ent = data[kind][num]
            if all(x in ent["eq"].lower() for x in split):
                date = f"#{num} / {ent['date'][0]} {ent['date'][1]}{ent['date'][2]}"

                try:
                    winner = ent['winners'][0]
                except Exception:
                    print(num, kind, ent)
                    continue  # malformed entry, skip it

                tix = formatPrice(ent["tickets"])
                eq = ent["eq"]  # don't shadow the search term

                print(date, winner, tix)
                lst.append([eq, winner, tix, date])

    return lst
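# weaponLotteryOutdated / armorLotteryOutdated are called above but not shown.
# A minimal sketch, assuming one drawing per day and a known date for drawing
# #1 (both the cadence and the epoch date are assumptions):
def weaponLotteryOutdated(latest_known):
    from datetime import date
    WEAPON_LOTTO_EPOCH = date(2010, 1, 1)  # hypothetical date of drawing #1
    expected = (date.today() - WEAPON_LOTTO_EPOCH).days + 1  # drawings so far
    return latest_known < expected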
async def _do_login(session=None):
    CONFIG = utils.load_bot_config()
    if session is None:
        session = get_session()

    # from chrome's network tab
    headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Origin': 'https://e-hentai.org',
        'Upgrade-Insecure-Requests': '1',
        'DNT': '1',
        'Content-Type': 'application/x-www-form-urlencoded',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '******',
        'Sec-Fetch-Dest': 'document',
        'Referer': 'https://e-hentai.org/',
        'Accept-Language': 'en-US,en;q=0.9',
    }

    params = (
        ('act', 'Login'),
        ('CODE', '01'),
    )

    data = {
        'CookieDate': '1',
        'b': 'd',
        'bt': '1-5',
        'UserName': CONFIG['eh_username'],
        'PassWord': CONFIG['eh_password'],
        'ipb_login_submit': 'Login!'
    }

    await session.post(r"https://forums.e-hentai.org/index.php",
                       headers=headers, params=params, data=data)
    return session
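# Hypothetical usage: log in once, then reuse the cookie-carrying session for
# authenticated forum requests:
async def fetch_forum_page(url):
    session = await _do_login()
    try:
        async with session.get(url) as resp:
            return await resp.text()
    finally:
        await session.close()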
async def scrape(cls):
    async with get_session() as session:
        # get search results page
        search_link = None
        while search_link is None:
            async with session.post(cls.THREAD_SEARCH['link'],
                                    data=cls.THREAD_SEARCH['payload']) as resp:
                soup = BeautifulSoup(await resp.text(), 'html.parser')

            tmp = soup.find(class_="redirectfoot")
            if tmp:
                search_link = tmp.find("a")['href']
            else:
                print("kedama: rate limit")
                await asyncio.sleep(30)  # wait for a while if hit rate limit

        # get thread links
        await asyncio.sleep(cls.DELAY)
        html = await get_html(search_link, session)
        print("kedama: getting", search_link)
        soup = BeautifulSoup(html, 'html.parser')

        num_pages = soup.find(id=lambda y: y and "page-jump" in y)
        num_pages = num_pages.get_text().replace(" Pages", "")
        num_pages = int(num_pages)

        links = cls._scrape_search_page(soup)
        for i in range(1, num_pages):
            await asyncio.sleep(cls.DELAY)
            html = await get_html(search_link + f"&st={25*i}", session)
            print("kedama: getting", search_link + f"&st={25*i}")
            soup = BeautifulSoup(html, 'html.parser')
            links += cls._scrape_search_page(soup)

        # save threads to file
        for x in links:
            thread_id = cls.THREAD_ID_REGEX.search(x).group(1)
            out_file = utils.KEDAMA_HTML_DIR + thread_id + ".html"

            if os.path.exists(out_file):
                continue

            await asyncio.sleep(cls.DELAY)
            html = await get_html(x, session)
            print("kedama: getting (no-html)", x)
            with open(out_file, "w", encoding='utf-8') as file:
                file.write(html)
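# cls._scrape_search_page is called above but not shown. A plausible sketch,
# assuming result rows link to threads via "showtopic" URLs (the selector is
# an assumption about the forum markup):
def _scrape_search_page(cls, soup):
    anchors = soup.find_all("a", href=lambda h: h and "showtopic" in h)
    # de-duplicate while preserving order
    return list(dict.fromkeys(a['href'] for a in anchors))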
async def do_hv_login(session=None):
    CONFIG = utils.load_yaml(utils.BOT_CONFIG_FILE)
    if session is None:
        session = get_session()

    await session.get(CONFIG['hv_login_link'])
    return session
async def searchLottoWinners(winner):
    import utils
    from utils.scraper_utils import get_session

    lst = []
    # [amount won, times won] per prize type; singular / plural forms are
    # merged at the end
    stats = {
        "Equips": [0, 0],
        "Chaos Token": [0, 0],
        "Chaos Tokens": [0, 0],
        "Golden Lottery Ticket": [0, 0],
        "Golden Lottery Tickets": [0, 0],
        "Caffeinated Candy": [0, 0],
        "Caffeinated Candies": [0, 0]
    }

    data = utils.load_json_with_default(utils.DATA_DIR + "lotto_data.json",
                                        default={"w": {}, "a": {}})

    # latest known weapon / armor lottery numbers
    numW = max([int(x) for x in data["w"].keys()]) if data['w'] else 0
    numA = max([int(x) for x in data["a"].keys()]) if data['a'] else 0

    # re-scrape if the cached data is stale
    if weaponLotteryOutdated(numW) or armorLotteryOutdated(numA):
        print("OUTDATED", numW, numA)
        await lotto_dl(get_session())
        await lotto_parse()
        data = utils.load_json_with_default(utils.DATA_DIR + "lotto_data.json",
                                            default={"w": {}, "a": {}})

    for kind in data:
        ks = list(data[kind].keys())
        ks.sort(key=lambda x: -int(x))  # newest lottery first

        for num in ks:
            ent = data[kind][num]
            lower = [x.lower() for x in ent['winners']]
            if winner.lower() in lower:
                date = f"#{num} / {ent['date'][0]} {ent['date'][1]}{ent['date'][2]}"
                place = lower.index(winner.lower())  # compare lowercased names

                if place == 0:
                    # grand prize: the equip itself
                    prize = ent["eq"].replace("Peerless ", "")
                    stats["Equips"][0] += 1
                    stats["Equips"][1] += 1
                    gp = ""
                else:
                    # consolation prize: (count, name) pairs
                    n = ent["prizes"][place - 1][0]
                    nm = ent["prizes"][place - 1][1]
                    prize = str(n) + " " + nm
                    stats[nm][0] += n
                    stats[nm][1] += 1
                    gp = ent["eq"].replace("Peerless ", "")

                tix = formatPrice(ent["tickets"])
                print(date, prize, tix)
                lst.append([prize, gp, tix, date])

    # merge singular / plural prize names
    merge = lambda x, y: [str(x[0] + y[0]), str(x[1] + y[1])]
    stats["Chaos Tokens"] = merge(stats["Chaos Tokens"], stats["Chaos Token"])
    del stats["Chaos Token"]
    stats["Golden Lottery Tickets"] = merge(stats["Golden Lottery Tickets"],
                                            stats["Golden Lottery Ticket"])
    del stats["Golden Lottery Ticket"]
    stats["Caffeinated Candies"] = merge(stats["Caffeinated Candy"],
                                         stats["Caffeinated Candies"])
    del stats["Caffeinated Candy"]
    stats["Equips"][0] = str(stats["Equips"][0])
    stats["Equips"][1] = str(stats["Equips"][1])

    return lst, stats
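# formatPrice is called above but not shown. A minimal sketch, assuming it
# abbreviates large ticket counts with k/m suffixes (the exact formatting is
# an assumption):
def formatPrice(value):
    value = int(value)
    if value >= 1_000_000:
        return f"{value / 1_000_000:.1f}m"
    if value >= 1_000:
        return f"{value / 1_000:.1f}k"
    return str(value)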
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.session = get_session()
    self.last_sent = 0
async def scrape(cls):
    # inits
    DATA = utils.load_json_with_default(utils.MARKET_ITEM_FILE)
    CACHE = utils.load_json_with_default(utils.MARKET_CACHE_FILE,
                                         default=cls.DEFAULT_CACHE)
    CACHE['invalid'] = set(CACHE['invalid'])

    target_page_number = 1
    target_index = None

    session = get_session()
    html = await get_html(cls.BASE_LINK, session)

    # Loop logic:
    #   1. add results for current page to data
    #   2. calculate target_index
    #   3. check if done
    #      (target_index >= num_results OR target_index >= a pending entry index)
    #   4. move to page containing target_index
    #      (the target index may shift off-page by the time we visit the page
    #       due to new purchases, but that doesn't matter, we'll get it eventually)
    #   5. go to step 1
    while True:
        # step 1
        result = cls.get_entries(html, target_page_number)
        DATA.update(result['entries'])
        total = result['total']
        CACHE['invalid'] |= result['invalid_indices']

        # step 2
        if target_index is None:
            target_index = 1  # one-indexed from oldest
        while str(target_index) in DATA or target_index in CACHE['invalid']:
            target_index += 1

        # step 3
        if result['pending_indices'] and target_index >= min(result['pending_indices']):
            break
        if target_index >= total:
            break

        # step 4
        target_page_number = cls.get_target_page(target_index, total)
        html = await get_html(cls.BASE_LINK + str(target_page_number), session)

        # be nice to lestion
        print(f"{(len(DATA.keys()) + len(CACHE['invalid']))} / {total}...", end="")
        await asyncio.sleep(cls.SCRAPE_DELAY)

        # intermediate save
        tmp = copy.deepcopy(CACHE)
        tmp['invalid'] = list(CACHE['invalid'])
        tmp['invalid'].sort()
        utils.dump_json(tmp, utils.MARKET_CACHE_FILE)
        utils.dump_json(DATA, utils.MARKET_ITEM_FILE)

    # final save
    CACHE['invalid'] = list(CACHE['invalid'])
    CACHE['invalid'].sort()
    utils.dump_json(CACHE, utils.MARKET_CACHE_FILE)
    utils.dump_json(DATA, utils.MARKET_ITEM_FILE)

    # the session is created outside a context manager, so close it explicitly
    await session.close()
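# cls.get_target_page is called above but not shown. A sketch of the paging
# arithmetic, assuming PER_PAGE entries per page listed newest-first (both
# the constant and the ordering are assumptions):
def get_target_page(cls, target_index, total):
    # entry `target_index` (1-indexed from oldest) sits at 0-based offset
    # (total - target_index) from the newest entry
    offset = total - target_index
    return offset // cls.PER_PAGE + 1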