def start(cls):
    root_url = "http://www.mangabz.com/manga-list-p%d/"
    page_num = 0
    while True:
        cls.init_thread()
        page_num += 1
        print("Now page " + str(page_num))
        url = root_url % page_num
        resp = HttpUtils.get_with_retry(url, headers=cls.headers)
        if resp is None:
            break

        links = HttpUtils.get_attrs(resp, ".mh-item-detali > .title > a", "href")
        if len(links) == 0:
            break

        for link in links:
            cls.task_pool.put(link)

    cls.process_thread.join()
    cls.fp.close()

def parse_lvl_two(cls, info):
    url = info[0]
    index = info[1]

    # create folder once
    folder_name = "output/龙珠/" + str(index)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        retry += 1
        assert retry < 5, "fail to query %s" % url

    links = HttpUtils.get_attrs(resp, ".ListContainer .ItemThumb a", "style")
    assert links is not None

    for link in links:
        image_url = re.search(r"background:url\(.*'(.*)'", link).group(1).replace("_thumb.", "")
        file_name = image_url.split("/")[-1]
        cls.task_pool.put([folder_name + "/" + file_name, image_url, 0])

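# ---------------------------------------------------------------------------
# The parse_lvl_* crawlers in this file share a producer/consumer pattern:
# they put [file_path, url, retry_count] triples onto cls.task_pool, and
# cls.init_thread() is expected to (re)start a worker that drains the queue.
# The real worker lives elsewhere in this repo; the sketch below only
# illustrates the assumed contract, with a hypothetical `fetch` callable
# standing in for the actual downloader.
import queue
import threading


def make_download_worker(task_pool, fetch, max_retry=3):
    """Sketch of a worker compatible with the task_pool contract above."""

    def drain():
        while True:
            try:
                file_path, url, retry = task_pool.get(timeout=5)
            except queue.Empty:
                return  # queue drained, let the thread exit
            try:
                fetch(url, file_path)
            except Exception:
                if retry + 1 < max_retry:
                    # re-queue with an incremented retry count
                    task_pool.put([file_path, url, retry + 1])
            finally:
                task_pool.task_done()

    return threading.Thread(target=drain, daemon=True)
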
def async_sign(self):
    self.site = self.generate_site()

    # busy-wait until the wall clock reads second 59 of the current minute
    while True:
        t = time.strftime("%M:%S", time.localtime())
        if t.endswith("59"):
            break
        time.sleep(1)
    print(HttpUtils.get_time_stamp())

    # then wait until ~0.9s into that second, just before the minute rolls over
    while True:
        t = int(datetime.datetime.now().microsecond / 10000)
        if t >= 90:
            break
        time.sleep(0.001)
    print(HttpUtils.get_time_stamp())

    print("go go go!")

    # fire bursts of requests; stop once we are more than ~0.3s into the second
    while True:
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.run(500))
        print(HttpUtils.get_time_stamp())

        t = int(datetime.datetime.now().microsecond / 10000)
        print(t)
        if t >= 30:
            break
        time.sleep(0.001)

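# async_sign() drives a run(n) coroutine that is not shown in this section.
# A minimal sketch of what it might look like, assuming aiohttp and a
# hypothetical sign_url attribute on the site object: fire n GET requests
# concurrently so that at least one lands within the first milliseconds of
# the new minute.
import aiohttp


async def run_sketch(site, n):
    async with aiohttp.ClientSession(headers=site.login_headers) as session:
        async def hit():
            async with session.get(site.sign_url) as resp:
                return resp.status

        # return_exceptions keeps a single failed request from
        # cancelling the rest of the burst
        return await asyncio.gather(*[hit() for _ in range(n)],
                                    return_exceptions=True)
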
def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id
    retry = 0
    while True:
        resp = HttpUtils.get(url)
        if resp is not None:
            break
        retry += 1
        assert retry < 5, "fail to query %s" % url

    cls.comic_name = HttpUtils.get_content(resp, "title").strip()

    links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href")
    titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a")
    assert len(titles) == len(links)

    cls.init_thread()

    for index in range(len(titles)):
        link = links[index]
        title = titles[index].strip()
        cls.parse_lvl_two((link, title))

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()

def load_weather_data(cls):
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4,zh-TW;q=0.2",
        "Content-Type": "application/x-www-form-urlencoded",
        "Host": "d1.weather.com.cn",
        "Referer": "http://www.weather.com.cn/weather1d/%s.shtml" % cls.city_code
    }

    # current conditions ("sk_2d" feed); the payload is JS like "var dataSK = {...}"
    res = HttpUtils.get("http://d1.weather.com.cn/sk_2d/%s.html?_=%d" % (cls.city_code, round(time.time() * 1000)),
                        headers=headers, return_raw=True)
    html = res.content.decode("utf-8")
    data = json.loads(html.replace("var dataSK = ", ""))

    # daily forecast ("dingzhi" feed); strip the JS wrappers before parsing.
    # The wrapper variable names embed the city code, so build them from
    # cls.city_code instead of hard-coding a single city.
    res = HttpUtils.get(
        "http://d1.weather.com.cn/dingzhi/%s.html?_=%d" % (cls.city_code, round(time.time() * 1000)),
        headers=headers, return_raw=True)
    html = res.content.decode("utf-8")
    html2 = html.replace("var cityDZ%s =" % cls.city_code, "") \
        .replace(";var alarmDZ%s ={\"w\":[]}" % cls.city_code, "")
    data2 = json.loads(html2).get("weatherinfo")

    # summary string (Chinese): today's weather, high/low, wind,
    # current temperature, AQI, relative humidity
    return "今天%s,最高气温%s,最低气温%s,%s%s, 当前气温%s,空气质量指数%s,相对湿度%s" % (
        data2.get("weather"), data2.get("temp"), data2.get("tempn"), data2.get("wd"), data2.get("ws"),
        data.get("temp"), data.get("aqi"), data.get("sd"))

def crawl_sub_category_book(cls, sub_category_meta):
    ku_book_title_list = list()

    category_name = sub_category_meta[0]
    sub_category_name = sub_category_meta[1]
    sub_category_link = cls.amazon_base_url + sub_category_meta[2]
    page_num = int(sub_category_meta[3])

    for page in range(1, page_num + 1):
        print("reading cat=%s,sub-cat=%s,page=%s" % (category_name, sub_category_name, page))
        url = sub_category_link.split("%page=")[0] + "&page=" + str(page)
        soup_obj = HttpUtils.get(url, headers=cls.amazon_headers)
        if soup_obj is None:
            print("blocked?")
            break

        title_list = HttpUtils.get_contents(
            soup_obj, "div.s-result-list div.sg-col-inner h2.a-size-mini span.a-size-medium")

        current_page_title_list = list()
        for title in title_list:
            # remove meta info in ASCII brackets, full-width brackets and 【】
            title = title.split("(")[0].split("(")[0].split("【")[0]
            ku_book_title_list.append(title)
            current_page_title_list.append(title)

        print(current_page_title_list)
        sleep(random() * 0.5 + 0.5)  # random delay between pages

    return ku_book_title_list

def get_score(self):
    self.check_in()
    soup = HttpUtils.get("http://www.miui.com/space-uid-2248502469.html")
    assert soup is not None
    score = HttpUtils.get_content(soup, "#statistic_content li:nth-of-type(1) a")
    return int(score)

def parse_users(cls, url):
    soup_obj = HttpUtils.get(url)
    if soup_obj is None:
        print(">>>>>> Fail to parse " + url)
        return None

    data_state = HttpUtils.get_attr(soup_obj, "#data", "data-state")
    data_map = json.loads(data_state)
    return data_map['entities']['users']

def parse_lvl_one(cls):
    if cls.book_id is None:
        print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<")
        return

    resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id), headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    cls.root_folder = os.path.join("output", cls.comic_name)

    links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href")
    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")
    image_numbers = HttpUtils.get_contents(resp, "div.detail-list-form-con a span")
    image_numbers = list(map(lambda x: re.search(r"(\d+)P", x).group(1), image_numbers))

    assert len(titles) == len(image_numbers)
    assert len(titles) == len(links)

    cnt = 0
    for index in range(len(titles)):
        cls.init_thread()

        link = links[index].replace("/", "").replace("m", "")
        title = titles[index].strip()
        image_number = image_numbers[index]

        # chapter_mode 1 keeps regular chapters ("第…话/話"),
        # chapter_mode 2 keeps volumes ("卷"/"第")
        if (cls.chapter_mode == 1 and "第" not in title and "话" not in title and "話" not in title) or \
                (cls.chapter_mode == 2 and "卷" not in title and "第" not in title):
            print("Skip " + title)
            continue

        is_skip = False
        if cls.inclusion_list is not None:
            for inclusion in cls.inclusion_list:
                if inclusion not in title:
                    is_skip = True
                    break

        if not is_skip and cls.parse_lvl_two((link, title, image_number)):
            cnt += 1

    if cnt > 0:
        cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()

def check_and_notify(cls):
    url = "https://www.flyertea.com/forum.php?mod=forumdisplay&orderby=dateline&sum=226&fid=226&mobile=2"
    soup_obj = HttpUtils.get(url, return_raw=False)

    titles = list(map(lambda title: title.strip(),
                      HttpUtils.get_contents(soup_obj, "div.n5sq_htmk p.n5_htnrbt")))
    readers = list(map(lambda x: int(x),
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_hthfcs")))
    flowers = list(map(lambda x: int(x) if x else 0,
                       HttpUtils.get_contents(soup_obj, "div.n5sq_htmk div.n5_htdzcs")))

    print(titles)
    print(readers)
    print(flowers)

def action(self, data):
    vote_url = "https://kp.m-team.cc/vote.php?tid=%s&type=1"

    success_cnt = 0
    for seed_id in data:
        res_obj = HttpUtils.get(url=vote_url % seed_id, headers=self.site.login_headers)
        msg = HttpUtils.get_content(res_obj, "#outer table h2")
        if msg == "操作成功":  # "operation succeeded"
            success_cnt += 1

    print("Vote success: " + str(success_cnt))

def parse_current_seeds(cls, print_log=True):
    seeds = []

    cmd_result = os.popen("transmission-remote -l").read()
    # drop the header row, the trailing "Sum:" row and the empty string after the final newline
    lines = cmd_result.split("\n")[1:-2]

    now = datetime.datetime.now()
    for line in lines:
        seed = TransmissionSeed()
        seeds.append(seed)

        data = line.split()
        seed.id = data[0].replace("*", "")

        cmd_result = os.popen("transmission-remote -t {0} -i".format(seed.id)).read()
        seed_details = cmd_result.split("\n")
        for detail in seed_details:
            if detail.startswith(" Name: "):
                seed.name = detail.replace(" Name: ", "")
            elif detail.startswith(" State: "):
                seed.status = detail.replace(" State: ", "")
            elif detail.startswith(" Percent Done:"):
                seed.done = float(detail.replace(" Percent Done: ", "").replace('%', ''))
            elif detail.startswith(" ETA: "):
                seed.ETA = detail.replace(" ETA: ", "").replace(" ", "").split("(")[0]
            elif detail.startswith(" Download Speed: "):
                seed.down = HttpUtils.pretty_format(
                    detail.replace(" Download Speed: ", "").replace(" ", "").split("/s")[0], "KB")
            elif detail.startswith(" Upload Speed: "):
                seed.up = HttpUtils.pretty_format(
                    detail.replace(" Upload Speed: ", "").replace(" ", "").split("/s")[0], "KB")
            elif detail.startswith(" Total size: "):
                seed.size = HttpUtils.pretty_format(
                    detail.replace(" Total size: ", "").replace(" ", "").split("(")[0], "MB")
            elif detail.startswith(" Ratio: "):
                ratio_str = detail.replace(" Ratio: ", "")
                if ratio_str == "None":
                    seed.ratio = 0.0
                else:
                    seed.ratio = float(ratio_str)
            elif detail.startswith(" Date added: "):
                start_time = parser.parse(detail.replace(" Date added: ", "").strip())
                # total_seconds() rather than .seconds, which discards whole days
                seed.since = (now - start_time).total_seconds()
            elif detail.startswith(" Downloaded: "):
                seed.done_size = HttpUtils.pretty_format(detail.replace(" Downloaded: ", ""), "KB")
            elif detail.startswith(" Location: "):
                seed.location = detail.replace(" Location: ", "")

    if print_log:
        for seed in seeds:
            print(seed)

    return seeds

def say_thank(self, id):
    site = self.generate_site()
    assert self.login(site)

    url = "http://hdhome.org/thanks.php"
    form_data = {"id": id}

    HttpUtils.post(url, data=form_data, headers=self.site.login_headers, returnRaw=True)
    print("Say thanks to " + str(id))

def parse_captcha(self, site):
    soup_obj = HttpUtils.get("https://pt.sjtu.edu.cn/login.php", headers=site.login_headers)
    captcha_image_list = soup_obj.select("form img")

    # if captcha image exists, parse expression and return
    if len(captcha_image_list) > 0:
        image_url = "https://pt.sjtu.edu.cn/" + captcha_image_list[0]["src"]
        HttpUtils.download_file(image_url, "/tmp/cap.png", over_write=True)
        return PuTaoCaptchaParser.analyze("/tmp/cap.png")
    else:
        # no captcha shown this time; return a placeholder value
        return "XxXx"

def sign(self):
    self.check_in()

    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    time_start = time.time()

    # fire the sign-in request repeatedly in a tight loop
    for i in range(100):
        HttpUtils.get("http://www.miui.com/extra.php?mod=sign/index&op=sign",
                      headers=self.site.login_headers, return_raw=True)

    time_end = time.time()
    print('time cost', time_end - time_start, 's')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))

def stat(self, unit="GB", update_cache=True):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.site.stat_page, headers=self.site.login_headers)
    assert soup_obj is not None

    div_list = soup_obj.select("table.mainouter tr td table tr td div[align='center']")
    assert len(div_list) == 1

    content = div_list[0].contents[0]
    m = re.search(r"获取(\d+.\d+)个魔力", content)
    assert m
    mp = float(m.group(1))

    span_list = soup_obj.select("#usermsglink span")
    up = HttpUtils.pretty_format(span_list[1].contents[2], unit)
    down = HttpUtils.pretty_format(span_list[1].contents[4], unit)

    prev_up = Cache().get(self.get_site_name() + "_up")
    prev_down = Cache().get(self.get_site_name() + "_down")

    if prev_up is None:
        prev_up = 0
    else:
        prev_up = float(prev_up.decode())

    if prev_down is None:
        prev_down = 0
    else:
        prev_down = float(prev_down.decode())

    delta_up = round(up - prev_up, 2)
    delta_down = round(down - prev_down, 2)
    if delta_down == 0:
        delta_ratio = "Inf"
    else:
        delta_ratio = round(delta_up / delta_down, 2)

    current_upload = round(up - down, 2)
    print("%s, mp=%s, up=%s, down=%s, current=%s, delta_up=%s, delta_down=%s, delta_ratio=%s" % (
        str(time.strftime("%Y-%m-%d %H:%M:%S")), mp, up, down, current_upload,
        delta_up, delta_down, delta_ratio))

    if update_cache:
        Cache().set(self.get_site_name() + "_up", up)
        Cache().set(self.get_site_name() + "_down", down)

    return mp, up, down

def login(self, site):
    if not self.isLogin and site.login_needed and not self.check_login(site):
        if site.need_captcha:
            site.login_captcha_value = self.parse_captcha(site)

        # trigger login action
        HttpUtils.post(site.login_page, data=self.build_post_data(site),
                       headers=site.login_headers, returnRaw=True)

        self.isLogin = self.check_login(site)
        return self.isLogin
    else:
        self.isLogin = True
        return True

def check_login(self, site):
    HttpUtils.create_session_if_absent()
    HttpUtils.load_cookie()

    soup_obj = HttpUtils.get(site.home_page, headers=site.login_headers)
    content = HttpUtils.get_content(soup_obj, site.login_verify_css_selector)
    print("Current user is " + str(content))
    result = content is not None and content == site.login_verify_str

    if result:
        HttpUtils.save_cookie()
    else:
        HttpUtils.clear_cookie()

    return result

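# check_login() relies on HttpUtils session/cookie helpers whose definitions
# live elsewhere. The sketch below captures the assumed behaviour using
# requests and pickle; the actual HttpUtils implementation may differ.
import os
import pickle
import requests


class SessionStoreSketch:
    """Hypothetical stand-in for HttpUtils' session/cookie persistence."""
    cookie_path = "/tmp/cookies.pkl"
    session = None

    @classmethod
    def create_session_if_absent(cls):
        if cls.session is None:
            cls.session = requests.Session()

    @classmethod
    def load_cookie(cls):
        if os.path.exists(cls.cookie_path):
            with open(cls.cookie_path, "rb") as f:
                cls.session.cookies.update(pickle.load(f))

    @classmethod
    def save_cookie(cls):
        with open(cls.cookie_path, "wb") as f:
            pickle.dump(cls.session.cookies, f)

    @classmethod
    def clear_cookie(cls):
        cls.session.cookies.clear()
        if os.path.exists(cls.cookie_path):
            os.remove(cls.cookie_path)
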
def parse_lvl_two(cls, info):
    chapter_url = info[0]
    title = info[1]

    # create folder once
    folder_name = "output/" + cls.comic_name + "/" + title
    if not os.path.exists(folder_name):
        os.makedirs(folder_name, exist_ok=True)

    # disabled: skip chapters whose files are already complete
    # path_file_number = len(glob.glob(pathname=folder_name + '/*'))
    # if path_file_number == image_number:
    #     print("下载完毕:" + title)  # download complete
    #     return

    print("开始下载: " + title)  # start downloading

    query_url = cls.root_url + chapter_url
    retry = 0
    while True:
        content = HttpUtils.get(query_url, headers=cls.headers)
        if content is not None:
            break
        retry += 1
        assert retry < 5, "fail to query %s" % query_url

    script_content = HttpUtils.get_contents(content, "script")
    # debug output of the chapter script
    print(script_content[2][1:].replace(";;", ";").replace(";", ";\n"))

    image_url_list = re.search(r"chapterImages.*=.*\[(.*)\];",
                               script_content[2]).group(1).replace("\"", "").split(",")
    path = re.search(r"chapterPath.*?=.*?\"(.*?)\";", script_content[2]).group(1)
    assert len(image_url_list) > 0

    index = 1
    for image_url in image_url_list:
        full_image_url = "http://js1.zzszs.com.cn/" + path + image_url
        file_path = "%s/%03d_%s" % (folder_name, index, image_url)
        cls.task_pool.put([file_path, full_image_url, 0])
        index += 1

def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None
    json_data = json.loads(content.text)

    book = json_data["data"]["animeName"]
    title = json_data["data"]["title"]
    number = json_data["data"]["numberStart"]
    images = json_data["data"]["contentImg"]

    # disabled variant: create one folder per chapter and drop images inside
    # folder_name = "%s/%03d_%s" % (book, int(number), title)
    # if not os.path.exists(folder_name):
    #     os.makedirs(folder_name)
    # for image in images:
    #     file_path = "/".join([folder_name, image["name"]])
    #     cls.task_pool.put([file_path, image["url"], 0])

    # active variant: prefix file names with the chapter id instead
    # (no "/" between folder_name and image_file_name, so everything
    # lands in the book folder)
    folder_name = "%s/%03d_%s" % (book, int(number), title)
    for image in images:
        image_file_name = image["name"]
        image_url = image["url"]
        file_path = folder_name + image_file_name
        cls.task_pool.put([file_path, image_url, 0])

def parse_lvl_one(cls):
    if cls.book_id is None:
        return

    resp = HttpUtils.get(
        "https://api.ishuhui.shop/ver/4e198319/anime/detail?id=%d&type=comics&.json" % cls.book_id,
        return_raw=True)
    assert resp is not None

    json_data = json.loads(resp.text)
    cartoons = json_data["data"]["comicsIndexes"]["1"]["nums"]

    cls.init_thread()

    for cartoon_type in cartoons.keys():
        posts = cartoons[cartoon_type]
        for index in posts.keys():
            post_id = posts[index][0]["id"]
            final_url = "https://prod-api.ishuhui.com/comics/detail?id=%s" % post_id
            cls.parse_lvl_two(final_url)

    cls.process_thread.join()

    # code below should be useless if everything goes well
    while not cls.task_pool.empty():
        print("pool size = " + str(cls.task_pool.qsize()))
        cls.init_thread()
        cls.process_thread.join()

def parse(self, soup_obj):
    assert soup_obj is not None

    tr_list = soup_obj.select("table.torrents tr")

    seeds = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue

        seed = SeedInfo()
        td_list = tr.select("td.rowfollow")
        if len(td_list) < 9:
            # skip embedded contents
            continue

        seed.sticky = len(td_list[1].select("table td img[alt=\"Sticky\"]")) > 0
        seed.title = td_list[1].select("table td a")[0]["title"]
        seed.url = td_list[1].select("table td a")[0]['href']
        seed.free = len(td_list[1].select("table font.free")) > 0
        seed.hot = len(td_list[1].select("table font.hot")) > 0
        seed.since = HttpUtils.get_content(td_list[3], "span")
        seed.size = float(self.parse_size(td_list[4]))
        seed.upload_num = int(self.clean_tag(td_list[5]))
        seed.download_num = int(self.clean_tag(td_list[6]))
        seed.finish_num = int(self.clean_tag(td_list[7]))
        seed.id = self.parse_id(seed.url)

        seeds.append(seed)

    return seeds

def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None

    m = re.search(r"chapter: \$\.evalJSON\('(.*)'\),", content.text)
    if not m or m.group(1) == "null":
        m = re.search(r"chapter: (.*),", content.text)
    assert m
    json_data = json.loads(m.group(1))

    book = json_data["comic_id"]
    number = json_data["chapter_id"]
    title = json_data["name"].strip().replace(" ", "-").replace("(", "(").replace(")", ")")

    # create folder once
    folder_name = "%s/%08d_%s" % (book, int(number), title)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    m = re.search(r"image_list: \$\.evalJSON\('(.*)'\),", content.text)
    if not m or m.group(1) == "null":
        m = re.search(r"image_list: (.*),", content.text)
    assert m
    json_data = json.loads(m.group(1))

    for index in json_data.keys():
        image_data = json_data[index]
        page = image_data["page"]
        # image URLs are base64-encoded in the page source
        image_url = base64.decodebytes(image_data["src"].encode("utf-8")).decode("utf-8")
        image_format = image_url.split(".")[-1]
        image_file_name = "%03d.%s" % (int(page), image_format)
        file_path = "/".join([folder_name, image_file_name])
        cls.task_pool.put([file_path, image_url, 0])

def parse_lvl_two(cls, url):
    content = HttpUtils.get(url, return_raw=True)
    assert content is not None

    location = os.path.join(os.path.dirname(__file__), "../bin/phantomjs")
    js_file = os.path.join(os.path.dirname(__file__), "../static/tencent_comic.js")

    print(">>> parsing " + url)
    data = os.popen("%s %s %s" % (location, js_file, url)).read()
    # retry once; os.popen().read() returns an empty string (not None) on failure
    if not data:
        data = os.popen("%s %s %s" % (location, js_file, url)).read()
    assert data
    print("****** data=" + data)

    json_data = json.loads(data)

    book = json_data["title"]
    number = json_data["cid"]
    title = json_data["cTitle"].strip().replace(" ", "-").replace("(", "(").replace(")", ")")

    # create folder once
    folder_name = "%s/%08d_%s" % (book, int(number), title)
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)

    for index in json_data["picture"].keys():
        image_url = json_data["picture"][index]
        image_file_name = "%03d.png" % int(index)
        file_path = "/".join([folder_name, image_file_name])
        cls.task_pool.put([file_path, image_url, 0])

def parse_size(self, soup_obj):
    assert soup_obj is not None
    assert len(soup_obj.contents) == 3

    size_num = round(float(soup_obj.contents[0]) * self.size_factor, 2)
    size_unit = soup_obj.contents[2]
    return HttpUtils.pretty_format(str(size_num) + str(size_unit), "MB")

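# pretty_format(value, unit) is used throughout this file to normalise
# strings like "1.5 TB" or "800MB" into a float expressed in the requested
# unit. Its source is not shown here; this sketch only captures the assumed
# contract, e.g. pretty_format_sketch("1.5 TB", "GB") -> 1536.0.
import re


def pretty_format_sketch(value, target_unit="MB"):
    """Hypothetical: parse '<number><unit>' and convert to target_unit."""
    scale = {"B": 0, "KB": 1, "MB": 2, "GB": 3, "TB": 4, "PB": 5}
    m = re.search(r"([\d.]+)\s*([KMGTP]?B)", str(value), re.IGNORECASE)
    assert m, "unrecognised size: %s" % value
    number, unit = float(m.group(1)), m.group(2).upper()
    return round(number * 1024 ** (scale[unit] - scale[target_unit]), 2)
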
def check_login(self, site):
    resp = HttpUtils.post(site.home_page, data={}, returnRaw=True).text
    json_value = json.loads(resp)
    if json_value['errNo'] == 0:
        content = json_value['data']['name']
        return content is not None and content == site.login_verify_str
    else:
        return False

def parse(self, soup_obj):
    assert soup_obj is not None

    info_block = soup_obj.select("#info_block table tr td:nth-of-type(1) span")[0]

    # labels and values alternate inside the span, so remember the previous
    # node and read the value that follows "上傳量" (upload) or "下載量" (download)
    prev_info = ""
    upload = 0
    download = 0
    for info in info_block.contents:
        if "上傳量" in prev_info:
            upload = HttpUtils.pretty_format(info, "GB")
        elif "下載量" in prev_info:
            download = HttpUtils.pretty_format(info, "GB")
            break
        prev_info = str(info)

    return upload, download

def read_msg(self, index):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.url + index, headers=self.site.login_headers)
    assert soup_obj is not None

    tr_list = soup_obj.select("#outer form table tr")

    messages = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue

        td_list = tr.select("td.rowfollow")
        if len(td_list) < 4:
            # skip footer
            continue

        msg = Message()
        msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
        msg.title = HttpUtils.get_content(td_list[1], "a")
        msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
        if msg.from_user is None:
            # for ad.
            msg.from_user = td_list[2].contents[0]
        msg.since = HttpUtils.get_content(td_list[3], "span")
        link = HttpUtils.get_attr(td_list[1], "a", "href")
        msg.id = link.split("id=")[1]

        messages.append(msg)

    print("--------------------------------------")
    index = 1
    for msg in messages:
        print("{:<2}|".format(index) + str(msg))
        index += 1
    print("--------------------------------------")

    return messages

def crawl(self):
    site = self.generate_site()
    assert self.login(site)

    for i in range(107, 164):
        soup_obj = HttpUtils.get(site.home_page + "?page=" + str(i), headers=site.login_headers)
        ids = self.parse(soup_obj)
        ParallelTemplate(150).run(func=self.say_thank, inputs=ids)
        print(">>>>>> finish page " + str(i))

def init_setting(self):
    self.login_if_not()

    # enable adult torrent
    setting_url = "https://kp.m-team.cc/usercp.php"
    lab_data = {
        "action": "laboratory",
        "type": "save",
        "laboratory_adult_mode": "0",
        "laboratory_torrent_page_https": "0"
    }
    res = HttpUtils.post(url=setting_url, data=lab_data,
                         headers=self.site.login_headers, returnRaw=True)
    assert res.status_code == 200

    # configure torrent list display options
    tracker_data = {
        "action": "tracker",
        "type": "save",
        "t_look": "1",  # show pic
        "tooltip": "off",
        "timetype": "timealive",
        "appendsticky": "yes",
        "radio": "icon",
        "smalldescr": "yes",
        "dlicon": "yes",
        "bmicon": "yes",
        "show_hot": "yes",
        "showfb": "yes",
        "showdescription": "yes",
        "showimdb": "yes",
        "showcomment": "yes",
        "appendnew": "yes",
        "appendpicked": "yes",
        "showcomnum": "yes"
    }
    res = HttpUtils.post(url=setting_url, data=tracker_data,
                         headers=self.site.login_headers, returnRaw=True)
    assert res.status_code == 200