def parse(self, soup_obj): assert soup_obj is not None tr_list = soup_obj.select("table.torrents tr") seeds = [] cnt = 0 for tr in tr_list: cnt += 1 if cnt == 1: # skip the caption tr continue seed = SeedInfo() td_list = tr.select("td.rowfollow") if len(td_list) < 9: # skip embedded contents continue seed.sticky = len( td_list[1].select("table td img[alt=\"Sticky\"]")) > 0 seed.title = td_list[1].select("table td a")[0]["title"] seed.url = td_list[1].select("table td a")[0]['href'] seed.free = len(td_list[1].select("table font.free")) > 0 seed.hot = len(td_list[1].select("table font.hot")) > 0 seed.since = HttpUtils.get_content(td_list[3], "span") seed.size = float(self.parse_size(td_list[4])) seed.upload_num = int(self.clean_tag(td_list[5])) seed.download_num = int(self.clean_tag(td_list[6])) seed.finish_num = int(self.clean_tag(td_list[7])) seed.id = self.parse_id(seed.url) seeds.append(seed) return seeds
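# parse() above follows a "skip the caption <tr>, then read fixed-position <td>
# cells" pattern. A minimal, self-contained sketch of that pattern with
# BeautifulSoup; the sample HTML and column layout below are invented for
# illustration, not the actual torrent listing markup:
from bs4 import BeautifulSoup

SAMPLE_HTML = """
<table class="torrents">
  <tr><th>Title</th><th>Size</th></tr>
  <tr><td><a href="details.php?id=1" title="Demo seed">Demo seed</a></td><td>1.5 GB</td></tr>
</table>
"""

def demo_parse_rows(html):
    soup = BeautifulSoup(html, "html.parser")
    rows = soup.select("table.torrents tr")
    results = []
    for index, tr in enumerate(rows):
        if index == 0:
            # skip the caption row, just like parse() does
            continue
        cells = tr.select("td")
        link = cells[0].select("a")[0]
        results.append({
            "title": link["title"],
            "url": link["href"],
            "size": cells[1].get_text(strip=True),
        })
    return results

print(demo_parse_rows(SAMPLE_HTML))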
def parse_lvl_one(cls): if cls.book_id is None: return url = "http://www.js518.net/mohuanmanhua/%s/" % cls.book_id retry = 0 while True: resp = HttpUtils.get(url) if resp is not None: break else: retry += 1 assert retry < 5, "fail to query %s" % url cls.comic_name = HttpUtils.get_content(resp, "title").strip() links = HttpUtils.get_attrs(resp, "#mh-chapter-list-ol-0 a", "href") titles = HttpUtils.get_contents(resp, "#mh-chapter-list-ol-0 a") assert len(titles) == len(links) cls.init_thread() for index in range(len(titles)): link = links[index] title = titles[index].strip() cls.parse_lvl_two((link, title)) cls.process_thread.join() # code below should be useless if everything goes well while not cls.task_pool.empty(): print("pool size = " + str(cls.task_pool.qsize())) cls.init_thread() cls.process_thread.join()
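# parse_lvl_one() retries the first request up to five times before giving up.
# A generic, self-contained version of that retry loop; the helper name and
# callable-based interface are illustrative, not part of the project:
def get_with_retry_demo(fetch, max_retries=5):
    # fetch is any zero-argument callable that returns None on failure
    for attempt in range(max_retries):
        result = fetch()
        if result is not None:
            return result
    raise RuntimeError("fail to query after %d attempts" % max_retries)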
def get_score(self):
    self.check_in()

    soup = HttpUtils.get("http://www.miui.com/space-uid-2248502469.html")
    assert soup is not None

    score = HttpUtils.get_content(soup, "#statistic_content li:nth-of-type(1) a")
    return int(score)
def read_msg(self, index):
    self.login_if_not()

    soup_obj = HttpUtils.get(self.url + index, headers=self.site.login_headers)
    assert soup_obj is not None

    tr_list = soup_obj.select("#outer form table tr")

    messages = []
    cnt = 0
    for tr in tr_list:
        cnt += 1
        if cnt == 1:
            # skip the caption tr
            continue

        td_list = tr.select("td.rowfollow")
        if len(td_list) < 4:
            # skip footer
            continue

        msg = Message()
        msg.read = len(td_list[0].select("img[alt=\"Read\"]")) > 0
        msg.title = HttpUtils.get_content(td_list[1], "a")
        msg.from_user = HttpUtils.get_content(td_list[2], "span a b")
        if msg.from_user is None:
            # for ad.
            msg.from_user = td_list[2].contents[0]
        msg.since = HttpUtils.get_content(td_list[3], "span")
        link = HttpUtils.get_attr(td_list[1], "a", "href")
        msg.id = link.split("id=")[1]

        messages.append(msg)

    print("--------------------------------------")
    index = 1
    for msg in messages:
        print("{:<2}|".format(index) + str(msg))
        index += 1
    print("--------------------------------------")

    return messages
def parse_lvl_one(cls): if cls.book_id is None: print(">>>>> ERROR Cannot Parse Comic ID, QUIT! <<<<<") return resp = HttpUtils.get_with_retry("%s/%s/" % (cls.root_url, cls.book_id), headers=cls.headers) assert resp is not None cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip() cls.root_folder = os.path.join("output", cls.comic_name) links = HttpUtils.get_attrs(resp, "div.detail-list-form-con a", "href") titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a") image_numbers = HttpUtils.get_contents( resp, "div.detail-list-form-con a span") image_numbers = list( map(lambda x: re.search("(\d+)P", x).group(1), image_numbers)) assert len(titles) == len(image_numbers) assert len(titles) == len(links) cnt = 0 for index in range(len(titles)): cls.init_thread() link = links[index].replace("/", "").replace("m", "") title = titles[index].strip() image_number = image_numbers[index] if (cls.chapter_mode == 1 and "第" not in title and "话" not in title and "話" not in title) or (cls.chapter_mode == 2 and "卷" not in title and "第" not in title): print("Skip " + title) continue is_skip = False if cls.inclusion_list is not None: for inclusion in cls.inclusion_list: if inclusion not in title: is_skip = True break if not is_skip and cls.parse_lvl_two((link, title, image_number)): cnt += 1 if cnt > 0: cls.process_thread.join() # code below should be useless if everything goes well while not cls.task_pool.empty(): print("pool size = " + str(cls.task_pool.qsize())) cls.init_thread() cls.process_thread.join()
def action(self, data): vote_url = "https://kp.m-team.cc/vote.php?tid=%s&type=1" success_cnt = 0 for id in data: res_obj = HttpUtils.get(url=vote_url % id, headers=self.site.login_headers) msg = HttpUtils.get_content(res_obj, "#outer table h2") if msg == "操作成功": success_cnt += 1 print("Vote success: " + str(success_cnt))
def parse_page(self, soup_obj): tr_list = soup_obj.select("#torrent_table tr") seeds = [] cnt = 0 for tr in tr_list: cnt += 1 if cnt == 1: # skip the caption tr continue seed = SeedInfo() td_list = tr.select("td") if len(td_list) < 10: continue seed.sticky = len(td_list[1].select("div img[alt=\"置顶\"]")) seed.title = HttpUtils.get_content(td_list[1].select("div a b")) seed.url = td_list[1].select("div a")[0]['href'] seed.free = len(td_list[1].select("div a img[alt=\"free\"]")) > 0 seed.since = HttpUtils.get_content(td_list[3], "span") seed.size = float(self.parse_size(td_list[4])) seed.upload_num = int(self.clean_tag(td_list[5])) seed.download_num = int(self.clean_tag(td_list[6])) seed.finish_num = int(self.clean_tag(td_list[7])) seed.id = self.parse_id(seed.url) # parse discount if len(td_list[1].select("table td font.halfdown")) > 0: seed.discount = 50 elif len(td_list[1].select("table td font.d30down")) > 0: seed.discount = 30 else: seed.discount = 100 seeds.append(seed) return seeds
def parse(self, soup_obj): assert soup_obj is not None tr_list = soup_obj.select("table.torrents tr") seeds = [] cnt = 0 for tr in tr_list: cnt += 1 if cnt == 1: # skip the caption tr continue seed = SeedInfo() td_list = tr.select("td.rowfollow") if len(td_list) < 9: # skip embedded contents continue seed.since = HttpUtils.get_content(td_list[2], "span") seed.size = float(self.parse_size(td_list[3])) seed.upload_num = int(self.clean_tag(td_list[4])) seed.download_num = int(self.clean_tag(td_list[5])) seed.finish_num = int(self.clean_tag(td_list[6])) seed.done = self.clean_tag(td_list[7]) seed.working = "peer-active" in td_list[7]['class'] td_title = tr.select("td.torrenttr tr td") seed.sticky = len(td_title[0].select("img[alt=\"Sticky\"]")) seed.title = td_title[0].select("a")[0]["title"] seed.url = td_title[0].select("a")[0]['href'] seed.free = len(td_title[0].select("img[alt=\"Free\"]")) > 0 seed.hot = len(td_title[0].select("font.hot")) > 0 if len(td_title[0].select("img[alt=\"50%\"]")) > 0: seed.discount = 50 elif len(td_title[0].select("img[alt=\"30%\"]")) > 0: seed.discount = 30 elif seed.free: seed.discount = 0 else: seed.discount = 100 seed.id = self.parse_id(seed.url) seeds.append(seed) print("Crawl: " + str(len(seeds))) if len(seeds) < 10: EmailSender.send(u"无法解析页面", Config.get("mteam_username")) return seeds
def check_login(self, site):
    HttpUtils.create_session_if_absent()
    HttpUtils.load_cookie()

    soup_obj = HttpUtils.get(site.home_page, headers=site.login_headers)
    content = HttpUtils.get_content(soup_obj, site.login_verify_css_selector)
    print("Current user is " + str(content))

    result = content is not None and content == site.login_verify_str
    if result:
        HttpUtils.save_cookie()
    else:
        HttpUtils.clear_cookie()

    return result
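# check_login() relies on HttpUtils to persist cookies between runs. The
# functions below are a hypothetical, stripped-down sketch of that
# save/load/clear pattern using requests + pickle; they are not the project's
# actual HttpUtils implementation, and the cookie file name is an assumption:
import os
import pickle
import requests

COOKIE_FILE = "cookie.dat"  # hypothetical location

demo_session = requests.Session()

def demo_load_cookie():
    # restore cookies from a previous run, if any
    if os.path.exists(COOKIE_FILE):
        with open(COOKIE_FILE, "rb") as fp:
            demo_session.cookies.update(pickle.load(fp))

def demo_save_cookie():
    # persist cookies so the next run can skip logging in again
    with open(COOKIE_FILE, "wb") as fp:
        pickle.dump(demo_session.cookies, fp)

def demo_clear_cookie():
    demo_session.cookies.clear()
    if os.path.exists(COOKIE_FILE):
        os.remove(COOKIE_FILE)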
def parse_page(self, soup_obj): items = soup_obj.select("item") assert len(items) != 0 seeds = [] for item in items: try: info = HttpUtils.get_content(item, "title").split("[") seed = SeedInfo() seed.title = info[0].strip() seed.size = HttpUtils.pretty_format(info[1].split("]")[0], "MB") seed.url = HttpUtils.get_attr(item, "enclosure", "url") seed.id = self.parse_id(seed.url) #Cache().set(seed.id, str(seed)) seeds.append(seed) except Exception as e: print(e.getMessage()) return seeds
def parse_page(self, soup_obj): items = soup_obj.select("item") assert len(items) != 0 seeds = [] for item in items: try: info = HttpUtils.get_content(item, "title").split("[") seed = SeedInfo() seed.title = info[0].strip() seed.size = HttpUtils.pretty_format(info[1].split(" ")[-2] + info[1].split(" ")[-1], "MB") # seed.url = HttpUtils.get_content(item, "link") seed.url = item.contents[4] seed.id = self.parse_id(seed.url) seeds.append(seed) except Exception as e: pass return seeds
def crawl_book(cls): tag_source_url = "https://book.douban.com/tag/" soup_obj = HttpUtils.get(tag_source_url) tags = HttpUtils.get_contents(soup_obj, "div.article tr td a") tags = [ '小说', '外国文学', '文学', '中国文学', '经典', '日本文学', '古典文学', '王小波', '当代文学', '钱钟书', '外国名著', '推理', '绘本', '青春', '东野圭吾', '科幻', '言情', '悬疑', '奇幻', '韩寒', '推理小说', '阿加莎·克里斯蒂', '科幻小说', '魔幻', '历史', '心理学', '哲学', '传记', '文化', '社会学', '艺术', '设计', '社会', '政治', '建筑', '宗教', '电影', '政治学', '数学', '中国历史', '回忆录', '思想', '国学', '人物传记', '人文', '音乐', '艺术史', '绘画', '戏剧', '西方哲学', '二战', '军事', '佛教', '近代史', '考古', '自由主义', '美术', '爱情', '旅行', '成长', '生活', '心理', '励志', '摄影', '教育', '游记', '灵修', '健康', '情感', '两性', '人际关系', '手工', '养生', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融', '投资', '营销', '理财', '创业', '广告', '股票', '企业史', '策划', '科普', '互联网', '编程', '科学', '交互设计', '用户体验', '算法', '科技', 'web', 'UE', '交互', '通信', 'UCD', '神经网络', '程序' ] print(tags) book_shelf = dict() for tag in tags: for page in range(0, 10): url = "https://book.douban.com/tag/%s?start=%d&type=T" % ( tag, page * 20) soup_obj = HttpUtils.get(url) if soup_obj is None: print("blocked?") break print(tag, page) books_obj = soup_obj.select("#subject_list ul > li") if len(books_obj) == 0: break for book_obj in books_obj: try: title = HttpUtils.get_attr(book_obj, "h2 a", "title") rating = float( HttpUtils.get_content(book_obj, "span.rating_nums")) people = int( HttpUtils.get_content(book_obj, "span.pl").strip().replace( "人评价", "").replace( "(", "").replace(")", "")) if people > cls.people_threshold: if title in book_shelf: book_shelf[title].tag.append(tag) else: book_shelf[title] = Book( title, rating, people, [tag]) except Exception as e: pass # 为了应对时间窗口内单 ip 访问数量限制,只是停顿一下 sleep(random() * 0.5 + 0.5) books = list(book_shelf.values()) with open("douban_book_raw.txt", "w") as fp: fp.write(json.dumps(books, default=Book.convert))
def crawl_single(self, user_id):
    if self.skip_if_exist and self.cache.hash_get(self.id_bucket_name, user_id) is not None:
        print("Skip " + str(user_id))
        return

    try:
        url = self.site.home_page % str(user_id)
        soup_obj = HttpUtils.get(url, headers=self.site.login_headers, return_raw=False)
        assert soup_obj is not None

        user = User()
        user.id = user_id
        user.name = HttpUtils.get_content(soup_obj, "#outer h1 span b")
        if user.name is None:
            return

        user.is_warn = len(soup_obj.select("#outer h1 span img[alt='Leechwarned']")) > 0
        user.is_ban = len(soup_obj.select("#outer h1 span img[alt='Disabled']")) > 0

        if user.is_warn:
            user.warn_time = str(time.strftime("%Y-%m-%d %H:%M:%S"))

        try:
            if len(soup_obj.select("#outer table tr")) <= 5:
                user.is_secret = True
                # print("secret user: name={0} id={1}".format(user.name, str(user_id)))
            else:
                tr_list = soup_obj.select("#outer table tr")
                for tr in tr_list:
                    td_name = HttpUtils.get_content(tr, "td:nth-of-type(1)")
                    if td_name == "加入日期":  # join date
                        user.create_time = HttpUtils.get_content(
                            tr, "td:nth-of-type(2)").replace(" (", "")
                    elif td_name == "最近動向":  # last seen
                        user.last_time = HttpUtils.get_content(
                            tr, "td:nth-of-type(2)").replace(" (", "")
                    elif td_name == "傳送":  # transfer: ratio / upload / download
                        user.ratio = HttpUtils.get_content(
                            tr, "td:nth-of-type(2) table tr td font")
                        if user.ratio is None:
                            # seems that no download is made and ratio is infinite
                            user.ratio = -1
                            user.up = self.parse_size_in_gb(
                                HttpUtils.get_content(
                                    tr,
                                    "td:nth-of-type(2) table tr:nth-of-type(1) td:nth-of-type(1)",
                                    1))
                            user.down = self.parse_size_in_gb(
                                HttpUtils.get_content(
                                    tr,
                                    "td:nth-of-type(2) table tr:nth-of-type(1) td:nth-of-type(2)",
                                    2))
                        else:
                            user.ratio = user.ratio.replace(",", "")
                            user.up = self.parse_size_in_gb(
                                HttpUtils.get_content(
                                    tr,
                                    "td:nth-of-type(2) table tr:nth-of-type(2) td:nth-of-type(1)",
                                    1))
                            user.down = self.parse_size_in_gb(
                                HttpUtils.get_content(
                                    tr,
                                    "td:nth-of-type(2) table tr:nth-of-type(2) td:nth-of-type(2)",
                                    2))
                    elif td_name == "魔力值":  # bonus points
                        user.mp = HttpUtils.get_content(tr, "td:nth-of-type(2)")

            # parse rank
            user.rank = "secret"
            imgs = soup_obj.select("table.main table tr > td > img[title!='']")
            for img in imgs:
                if not img.has_attr("class"):
                    user.rank = img["title"]
                    if "Peasant" in user.rank:
                        user.warn_time = str(time.strftime("%Y-%m-%d %H:%M:%S"))
                    # print("###### find user=" + user.name + " id=" + str(user_id) + " rank=" + user.rank)
        except Exception as e:
            print(str(user_id) + "\n" + str(e) + "\n")

        self.buffer.append(user)
    except Exception as e:
        print(">>>>> fail to parse " + str(user_id))
        self.errors.append(user_id)
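# crawl_single() normalizes sizes through self.parse_size_in_gb(), whose body is
# not shown here. The converter below is a hypothetical, self-contained sketch in
# the same spirit; the accepted units and parsing rules are assumptions, not the
# project's actual logic:
def size_text_to_gb(text):
    # e.g. "512 MB" -> 0.5, "1.5 TB" -> 1536.0
    units = {"KB": 1.0 / (1024 * 1024), "MB": 1.0 / 1024, "GB": 1.0, "TB": 1024.0}
    value, unit = text.strip().split()
    return float(value.replace(",", "")) * units[unit.upper()]

print(size_text_to_gb("512 MB"))   # 0.5
print(size_text_to_gb("1.5 TB"))   # 1536.0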
def water_copy(self):
    self.check_in()

    forum_id_list = ["772", "773"]
    forum_id = forum_id_list[int(random() * len(forum_id_list)) - 1]

    article_url_template = "http://www.miui.com/forum.php?mod=forumdisplay&fid={0}&orderby=replies&filter=reply&orderby=replies&page={1}"

    page_num = 1
    max_cnt = 50
    reply_list = dict()

    stop_flag = False
    while not stop_flag:
        soup_obj = HttpUtils.get(article_url_template.format(forum_id, page_num))
        print("current page: " + str(page_num))
        page_num += 1

        article_list = soup_obj.select("tbody")
        for article in article_list:
            id = article.attrs["id"]
            if not id.startswith("normalthread"):
                continue

            id = id[13:]

            if Cache().get(id) is not None:
                print("Skip " + id)
                # has been replied within a few days, skip
                continue

            title = HttpUtils.get_content(article, ".sub-tit > a:nth-of-type(1)")

            # don't want to copy comments of author
            author = HttpUtils.get_content(article, ".sub-infos a:nth-of-type(1)")

            reply_num = HttpUtils.get_content(article, "span.number_d a:nth-of-type(1)")
            total_thread_page_num = int(int(reply_num) / 10)
            start_thread_page_num = int(total_thread_page_num / 3)
            end_thread_page_num = start_thread_page_num * 2
            current_thread_page_num = start_thread_page_num + int(random() * 3)

            content_candidates = list()
            while len(content_candidates) == 0 and current_thread_page_num <= end_thread_page_num:
                page_url = self.page_url_template_copy.format(id, current_thread_page_num)
                current_thread_page_num += 1

                page_soup_obj = HttpUtils.get(page_url, headers=self.site.login_headers)
                assert page_soup_obj is not None

                # check if allow to reply; "您现在无权发帖" = "you are not allowed to post right now"
                edit_content = HttpUtils.get_content(page_soup_obj, "#fastposteditor .pt")
                if edit_content is not None and "您现在无权发帖" in str(edit_content):
                    Cache().set(id, "")
                    print(id + " not allowed to reply")
                    break

                # skip vote(less score)
                form = page_soup_obj.select("#poll", limit=1)
                if form is not None and len(form) > 0:
                    Cache().set(id, "")
                    print(id + " skip vote")
                    break

                post_list = page_soup_obj.select("#postlist > div")
                for post in post_list:
                    try:
                        current_author = HttpUtils.get_content(post, ".authi a")
                        if current_author == author:
                            continue

                        score = int(HttpUtils.get_content(post, ".pil dd a"))
                        if score < 1500:
                            continue

                        content = HttpUtils.get_content(post, ".pct table tr td.t_f")
                        if content is None or content.strip() == "" or len(content) < 10 or len(content) > 50:
                            continue

                        if author in content:
                            continue

                        contain_black_list = False
                        for black_word in self.comments_black_list:
                            if black_word in content:
                                contain_black_list = True
                                break
                        if contain_black_list:
                            continue

                        content_candidates.append(content.strip())
                    except:
                        pass

            print(title)
            print(content_candidates)

            if len(content_candidates) > 0:
                # randomly pick one
                reply_list[id] = content_candidates[int(random() * len(content_candidates)) - 1]
                print(id + " -- " + reply_list[id])

            print("current reply=" + str(len(reply_list)))
            if len(reply_list) >= max_cnt:
                stop_flag = True
                break

    # start reply
    for thread_id in reply_list:
        try:
            message = reply_list[thread_id]

            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["usesig"] = "1"
            post_data["subject"] = " "
            post_data["message"] = message

            form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid={0}&tid={1}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(
                forum_id, thread_id)

            print(thread_id, message, self.get_score())

            post_result = HttpUtils.post(form_submit_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None

            Cache().set_with_expire(thread_id, message, 86400 * 4)
            time.sleep(int(random() * 60) + 90)
        except:
            pass
def zz_copy(self):
    source_url_template = "http://www.miui.com/forum.php?mod=forumdisplay&fid=773&orderby=dateline&filter=author&orderby=dateline&page={0}"
    thread_url_template = "http://www.miui.com/thread-{0}-1-1.html"
    post_url = "http://www.miui.com/forum.php?mod=post&action=newthread&fid=773&extra=&topicsubmit=yes"
    min_page_num = 300

    self.check_in()

    # keyword filters applied to thread titles
    title_white_list = ["问题", "探索版", "怎么", "什么"]
    title_black_list = ["内测", "发货", "积分", "在线"]

    page_num = min_page_num + int(random() * 700)
    max_cnt = 20
    article_candidates = dict()

    stop_flag = False
    while not stop_flag:
        try:
            soup_obj = HttpUtils.get(source_url_template.format(page_num))
            page_num -= 1
            assert soup_obj is not None
            print("current page: " + str(page_num))

            article_list = soup_obj.select("tbody")
            for article in article_list:
                id = article.attrs["id"]
                if not id.startswith("normalthread"):
                    continue

                id = id[13:]

                if Cache().get("ZZ_" + id) is not None:
                    print("Skip " + id)
                    # has been ZZed within a few days, skip
                    continue

                title = HttpUtils.get_content(article, ".sub-tit > a:nth-of-type(1)")
                reply_num = int(HttpUtils.get_content(article, "span.number_d a:nth-of-type(1)"))
                if reply_num > 8:
                    continue

                is_white_list = False
                for white_list in title_white_list:
                    if white_list in title:
                        is_white_list = True
                if not is_white_list:
                    break

                is_black_list = False
                for black_list in title_black_list:
                    if black_list in title:
                        is_black_list = True
                if is_black_list:
                    break

                thread_soup_obj = HttpUtils.get(thread_url_template.format(id))
                assert thread_soup_obj is not None

                content = HttpUtils.get_content(thread_soup_obj, "#postlist > div .t_f")
                if content is None or content.strip() == "":
                    continue

                article_candidates[id] = (title, content.strip())
                if len(article_candidates) >= max_cnt:
                    stop_flag = True
                    break
        except:
            pass

    for id in article_candidates:
        try:
            (title, message) = article_candidates[id]

            post_data = dict()
            post_data["posttime"] = str(int(time.time()))
            post_data["formhash"] = self.form_hash_mirror
            post_data["wysiwyg"] = "1"
            post_data["typeid"] = "7562"
            post_data["allownoticeauthor"] = "1"
            post_data["addfeed"] = "1"
            post_data["usesig"] = "1"
            post_data["save"] = ""
            post_data["uploadalbum"] = "-2"
            # "请输入相册名称" is the form's default "enter album name" placeholder
            post_data["newalbum"] = "请输入相册名称"
            post_data["subject"] = title
            post_data["message"] = message

            print((title, message))

            post_result = HttpUtils.post(post_url,
                                         headers=self.site.login_headers,
                                         data=post_data,
                                         returnRaw=False)
            assert post_result is not None

            Cache().put("ZZ_" + id)

            time.sleep(int(random() * 300) + 1800)
        except:
            pass
def vote(self):
    self.check_in()

    source_list_url_template = "http://www.miui.com/home.php?mod=space&uid=133153462&do=thread&view=me&order=dateline&from=space&page={0}"

    page_num = 1
    max_cnt = 10
    cnt = 0

    stop_flag = False
    while not stop_flag:
        soup = HttpUtils.get(source_list_url_template.format(page_num),
                             headers=self.site.login_headers)
        assert soup is not None
        page_num += 1

        current_score = self.get_score()
        previous_score = current_score

        article_urls = HttpUtils.get_attrs(soup, "div.tl th > a", "href")
        for article_url in article_urls:
            try:
                article_url = "http://www.miui.com/" + article_url
                article_soup = HttpUtils.get(article_url, headers=self.site.login_headers)
                assert article_soup is not None

                title = HttpUtils.get_content(article_soup, "title")
                form = article_soup.select("#poll", limit=1)
                option = article_soup.select("#option_1", limit=1)

                if form is None or len(form) == 0:
                    continue
                if option is None or len(option) == 0:
                    continue

                print(title)

                # do vote here
                post_url = "http://www.miui.com/" + HttpUtils.get_attr(
                    article_soup, "#poll", "action") + "&inajax=1"
                post_data = dict()
                post_data["pollanswers[]"] = HttpUtils.get_attr(article_soup, "#option_1", "value")
                post_data["formhash"] = self.form_hash_mirror

                post_result = HttpUtils.post(post_url,
                                             headers=self.site.login_headers,
                                             data=post_data,
                                             returnRaw=False)
                assert post_result is not None

                current_score = self.get_score()
                print(previous_score)
                print(current_score)

                cnt += 1
                if cnt >= max_cnt or previous_score == current_score:
                    stop_flag = True
                    break

                previous_score = current_score
                time.sleep(60)
            except:
                pass
def do_process(cls, link):
    resp = HttpUtils.get_with_retry(cls.root_url + link, headers=cls.headers)
    assert resp is not None

    cls.comic_name = HttpUtils.get_content(resp, ".detail-info-title").strip()
    comic_author = HttpUtils.get_content(resp, ".detail-info-tip span a").strip()
    comic_status = HttpUtils.get_content(resp, ".detail-info-tip span:nth-of-type(2) span").strip()

    titles = HttpUtils.get_contents(resp, "div.detail-list-form-con a")

    # validation
    titles = list(map(lambda x: x.strip(), titles))
    if len(titles) == 0:
        return

    chap_ids = list()
    vol_ids = list()
    for title in titles:
        id = re.search(r".+?(\d*).+?", title).group(1)
        if id == "":
            # print("Cannot parse: " + title)
            pass
        else:
            if "話" in title:    # chapter
                chap_ids.append(int(id))
            elif "卷" in title:  # volume
                vol_ids.append(int(id))

    max_chap = -1
    max_vol = -1
    is_missed = False

    if len(chap_ids) > 0:
        missing_ids = list()
        chap_ids.sort()
        max_chap = chap_ids[-1]
        for i in range(1, max_chap + 1):
            if i not in chap_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing chapters: " + str(missing_ids))
            is_missed = True

    if len(vol_ids) > 0:
        missing_ids = list()
        vol_ids.sort()
        max_vol = vol_ids[-1]
        for i in range(1, max_vol + 1):
            if i not in vol_ids:
                missing_ids.append(i)
        if len(missing_ids) > 0:
            # print("Missing volumes: " + str(missing_ids))
            is_missed = True

    if not is_missed:
        # print(">>>>>>>>>>>> WOW! FULL SET: %s <<<<<<<<<<<<" % cls.comic_name)
        cls.output_pool.put((cls.comic_name, comic_author, comic_status, max_chap, max_vol, link))
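# do_process() detects gaps in chapter/volume numbering by scanning 1..max and
# collecting numbers that never appear. A small self-contained sketch of that
# check; the helper name and sample data are invented for illustration:
def find_missing(ids):
    if not ids:
        return [], -1
    highest = max(ids)
    present = set(ids)
    missing = [i for i in range(1, highest + 1) if i not in present]
    return missing, highest

print(find_missing([1, 2, 4, 7]))  # ([3, 5, 6], 7)
print(find_missing([]))            # ([], -1)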
def water(self):
    self.check_in()

    url_prefix = "http://www.miui.com/forum.php?mod=forumdisplay&fid=5&orderby=dateline&filter=author&orderby=dateline&page="

    page = 1
    cnt = 1
    max_cnt = 50

    # Chinese numerals for the digits 0-9
    chinese_char = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

    id_list = []
    while True:
        soup_obj = HttpUtils.get(url_prefix + str(page))
        print("new page: " + str(page))
        id_list.extend(HttpUtils.get_attrs(soup_obj, "tbody", "id"))
        page += 1
        if len(id_list) > max_cnt:
            break

    id_list = id_list[:max_cnt]

    for id in id_list:
        if not id.startswith("normalthread"):
            continue

        id = id[13:]
        page_url = self.page_url_template.format(id)
        page_soup_obj = HttpUtils.get(page_url)
        assert page_soup_obj is not None

        # spell out the reply counter in Chinese numerals
        i = str(cnt)
        length = len(i)
        num = ""
        for index in range(length):
            num += chinese_char[int(i[index])]

        # spell out the thread id in Chinese numerals
        id_num = ""
        for index in range(len(id)):
            id_num += chinese_char[int(id[index])]

        # spell out a random number in Chinese numerals
        random_id = str(int(random() * 1000000000000000))
        random_id_num = ""
        for index in range(len(random_id)):
            random_id_num += chinese_char[int(random_id[index])]

        title = HttpUtils.get_content(page_soup_obj, "title").strip().replace("_灌者为王_MIUI论坛", "")

        # message reads: "time {0}, thread ID {1}, title "{2}", random number {3}, credit #{4}, sorry for the noise"
        message = "时间{0},帖子ID{1},标题\"{2}\",随机数{3},第{4}个积分,打扰".format(
            time.strftime("%b %d %Y %H:%M:%S", time.localtime()), id_num, title,
            random_id_num, num)

        # form_hash = page_soup_obj.select("input[name='formhash']")[0]["value"]

        post_data = dict()
        post_data["posttime"] = str(int(time.time()))
        post_data["formhash"] = self.form_hash_mirror
        post_data["usesig"] = "1"
        post_data["subject"] = " "
        post_data["message"] = message

        form_submit_url = "http://www.miui.com/forum.php?mod=post&action=reply&fid=5&tid={0}&extra=page=1&replysubmit=yes&infloat=yes&handlekey=fastpost".format(id)

        # print(post_data)
        post_result = HttpUtils.post(form_submit_url,
                                     headers=self.site.login_headers,
                                     data=post_data,
                                     returnRaw=False)
        assert post_result is not None

        time.sleep(int(random() * 60) + 90)
        cnt += 1
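# water() spells numbers out digit by digit with Chinese numerals. The helper
# below is a self-contained sketch of that conversion; the function name is
# ours, not part of the original code:
CHINESE_DIGITS = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]

def to_chinese_digits(number):
    # map each decimal digit to its Chinese numeral, e.g. 2024 -> "贰零贰肆"
    return "".join(CHINESE_DIGITS[int(ch)] for ch in str(number))

print(to_chinese_digits(2024))  # 贰零贰肆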