def crawler_page_html(page_url, retry=True):
    # raise gen.Return(open("test.html", "rb").read())  # DEBUG
    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "max-age=0",
            "upgrade-insecure-requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
        },
        "proxy_host": "192.168.206.1",
        "proxy_port": 1080,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)
    if response.code != 200:
        raise gen.Return("")
    raise gen.Return(response.body)
def crawler_page_html(page_url, retry=True):
    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "cache-control": "max-age=0",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "proxy_host": None,
        "proxy_port": None,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)
    if response.code != 200:
        # raise Exception("http status code %s,%s" % (response.code, response.error))
        raise gen.Return("")
    raise gen.Return(response.body)
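# --- Minimal usage sketch for the crawler_page_html variants above (not part of the
# original module). It assumes the functions are meant to run under
# tornado.gen.coroutine (the decorator is not shown here) and that tool.http_request
# returns a Tornado-style HTTPResponse with .code and .body. The target URL is a
# placeholder.
from tornado import gen, ioloop

fetch_page = gen.coroutine(crawler_page_html)  # apply the assumed decorator explicitly for the demo

def demo_fetch():
    html = ioloop.IOLoop.current().run_sync(lambda: fetch_page("http://example.com/"))
    print("fetched %s bytes" % len(html or ""))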
def get_post_page_head(post_url, postfix_list):
    post_page_return_code, post_page_data = tool.http_request(post_url)[:2]
    # If the URL without a postfix is reachable, use that page directly;
    # otherwise try each postfixed URL in turn until one responds.
    if post_page_return_code != 1:
        for postfix in postfix_list:
            temp_post_url = post_url + "/" + urllib2.quote(postfix)
            post_page_return_code, post_page_data = tool.http_request(temp_post_url)[:2]
            if post_page_return_code == 1:
                break
    if post_page_data is not None:
        return tool.find_sub_string(post_page_data, "<head", "</head>", 3)
    else:
        return None
def construct_request(self):
    yield self.set_new_proxy()
    if self.proxy_item:
        logging.debug("Forward request via upstream proxy %s" % self.proxy_item)
        # Change the `X-Forwarded-For` header (hope it works for some cases)
        if self.proxy_item["anoy"] != True:
            self.request.headers["Via"] = "NaN"
            self.request.headers["X-Forwarded-For"] = self.proxy_item["proxy_host"]
        raise tornado.gen.Return(
            tool.http_request({
                "url": self.request.uri,
                "method": self.request.method,
                "headers": self.request.headers,
                "body": self.request.body or None,
                "proxy_host": self.proxy_item["proxy_host"],
                "proxy_port": self.proxy_item["proxy_port"],
                "request_timeout": 15,
                "follow_redirects": False,
                "allow_nonstandard_methods": True
            })
        )
    else:
        logging.error("Proxy server error: No available proxy.")
        self.set_status(self.ERROR_STATUS_CODE)
        self.finish("Proxy server error:\n No available proxy.")
        raise tornado.gen.Return(None)
def get_one_page_post_url_list(account_id, page_count):
    # http://moexia.lofter.com/?page=1
    index_page_url = "http://%s.lofter.com/?page=%s" % (account_id, page_count)
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        return re.findall('"(http://' + account_id + '.lofter.com/post/[^"]*)"', index_page)
    return None
def check_invalid():
    # Work out where the archive files are stored
    config_path = os.path.join(os.getcwd(), "..\\common\\config.ini")
    config = robot.read_config(config_path)
    save_data_path = robot.get_config(config, "SAVE_DATA_PATH", "info/save.data", 3)
    save_data_dir = os.path.dirname(save_data_path)
    fee_save_data_path = os.path.join(save_data_dir, "fee.data")
    # Read the list of paid albums from the archive
    if not os.path.exists(fee_save_data_path):
        log.step("收费相册存档不存在")
        return
    fee_save_data_file = open(fee_save_data_path, "r")
    fee_save_data = fee_save_data_file.read()
    fee_save_data_file.close()
    fee_album_id_list = fee_save_data.strip().split(" ")
    new_fee_album_id_list = []
    # Visit each album in turn and check whether it has been deleted
    for fee_album_id in fee_album_id_list:
        album_url = "http://meituzz.com/album/browse?albumID=%s" % fee_album_id
        album_page_return_code, album_page = tool.http_request(album_url)[:2]
        if album_page_return_code == 1:
            if album_page.find("<title>相册已被删除</title>") == -1:
                new_fee_album_id_list.append(fee_album_id)
            else:
                log.step("第%s页相册已被删除" % fee_album_id)
    # Write the filtered list back to the archive
    fee_save_data_file = open(fee_save_data_path, "w")
    fee_save_data_file.write(" ".join(new_fee_album_id_list) + " ")
    fee_save_data_file.close()
def get_picasaweb_page_album_id(account_id, picasaweb_url):
    message_page_return_code, message_page = tool.http_request(picasaweb_url)[:2]
    if message_page_return_code == 1:
        # Find the album id on the picasaweb page
        album_archive_url = "https://get.google.com/albumarchive/pwa/%s/album/" % account_id
        return tool.find_sub_string(message_page, 'href="%s' % album_archive_url, '"')
    return None
def get_one_page_album(account_id, token):
    index_url = "https://plus.google.com/_/photos/pc/read/"
    post_data = 'f.req=[["posts",null,null,"synthetic:posts:%s",3,"%s",null],[%s,1,null],"%s",null,null,null,null,null,null,null,2]' % (account_id, account_id, GET_IMAGE_URL_COUNT, token)
    index_page_return_code, index_page = tool.http_request(index_url, post_data)[:2]
    if index_page_return_code == 1:
        return index_page
    return None
def akb(file_handle):
    for team_id in [1, 2, 3, 4, 12]:
        index_url = "http://www.akb48.co.jp/about/members/?team_id=" + str(team_id)
        return_code, page = tool.http_request(index_url)[:2]
        if return_code == 1:
            member_list_page = tool.find_sub_string(page, '<ul class="memberListUl">', '</ul>')
            if member_list_page:
                member_list = re.findall("<li>([\s|\S]*?)</li>", member_list_page)
                for member in member_list:
                    member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                    japanese_name = tool.find_sub_string(member, '<h4 class="memberListNamej">', '</h4>')
                    english_name = tool.find_sub_string(member, '<p class="memberListNamee">', '</p>')
                    team_find = re.findall('<h5 class="memberListTeam">([^<]*)</h5>', member)
                    if not japanese_name:
                        print "error japanese_name"
                        continue
                    if not english_name:
                        print "error english_name"
                        continue
                    if (team_id != 12 and len(team_find) != 1) or (team_id == 12 and len(team_find) != 2):
                        print "error team_find"
                        continue
                    japanese_name = japanese_name.replace(" ", "")
                    first_name, last_name = english_name.split(" ", 1)
                    team = team_find[0].strip().replace(" /", " / ")
                    file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
            else:
                print "error member_list_page"
def get_one_page_blog(account_id, page_count):
    # http://blog.nogizaka46.com/asuka.saito
    blog_url = "http://blog.nogizaka46.com/%s/?p=%s" % (account_id, page_count)
    blog_return_code, blog_page = tool.http_request(blog_url)[:2]
    if blog_return_code == 1:
        return tool.find_sub_string(blog_page, '<div class="paginate">', '<div class="paginate">', 1)
    return None
def get_store_info(info):
    # Fetch basic item information & price
    store_api_url = STORE_URL % (
        info["itemid"],
        MY_AREA,
        info["venderId"],
        info["cat"],
        str(time.time()).replace(".", ""),
        JQNAME,
    )
    # ap(store_api_url)
    if not DEBUG:
        # response = requests.get(store_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": store_api_url,
            "method": "GET",
            "headers": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                # "Referer": store_api_url,
                # "Pragma": "no-cache",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        # ap("response:", response)
        open("store_api.js", "w").write(tool.try_decode_html_content(response.body))
    store_api_content = open("store_api.js", "r").read()
    store_api_content_json = get_jsonp_json(store_api_content)
    # ap(store_api_content_json)
    # Vendor name
    vender_string = (store_api_content_json["stock"].get("self_D") or
                     store_api_content_json["stock"].get("D") or {}).get("vender") or "自营"
    # Take the PLUS price (usually lower) or fall back to the regular price
    if store_api_content_json["stock"].get("jdPrice"):
        price = store_api_content_json["stock"]["jdPrice"].get("tpp") or store_api_content_json["stock"]["jdPrice"]["p"]
        if store_api_content_json["stock"]["jdPrice"].get("sfp"):
            price = min(store_api_content_json["stock"]["jdPrice"].get("sfp"), price)
    else:
        price = "-1.00"
    # q.d()
    return {
        "price": float(price),
        "vender": vender_string,
        "stock": store_api_content_json["stock"]["StockStateName"],
    }
def get_one_page_follow_list(account_id, cursor=None):
    query_url = "https://www.instagram.com/query/"
    # Fields supported by node: id, is_verified, followed_by_viewer, requested_by_viewer, full_name, profile_pic_url, username
    params = "nodes{username},page_info"
    if cursor is None:
        post_data = "q=ig_user(%s){follows.first(%s){%s}}" % (account_id, USER_COUNT_PER_PAGE, params)
    else:
        post_data = "q=ig_user(%s){follows.after(%s,%s){%s}}" % (account_id, cursor, USER_COUNT_PER_PAGE, params)
    # todo session id error
    # IGSCdaccb7f76627fa16a0d418f32a733030cb4cdeefaaddc5464a3da52eb8acfe06%3AID8fxYoOH96eMPpf4kEWwIhLA9ihMLuO%3A%7B%22_token_ver%22%3A2%2C%22_auth_user_id%22%3A3539660450%2C%22_token%22%3A%223539660450%3Amm50iieIxyG0NWWxuFifs0j23vhA5WpR%3Afd860ccd5c16e35eadf3e0946c00178b50fce7b45a9d09c62498dbbffdc8fa2b%22%2C%22asns%22%3A%7B%2247.89.39.193%22%3A45102%2C%22time%22%3A1480388199%7D%2C%22_auth_user_backend%22%3A%22accounts.backends.CaseInsensitiveModelBackend%22%2C%22last_refreshed%22%3A1480392303.831638%2C%22_platform%22%3A4%2C%22_auth_user_hash%22%3A%22%22%7D
    header_list = {
        "Referer": "https://www.instagram.com/",
        "X-CSRFToken": CSRF_TOKEN,
        "Cookie": "csrftoken=%s; sessionid=%s;" % (CSRF_TOKEN, SESSION_ID),
    }
    follow_list_return_code, follow_list_data = tool.http_request(query_url, post_data, header_list)[:2]
    if follow_list_return_code == 1:
        try:
            follow_list_data = json.loads(follow_list_data)
        except ValueError:
            pass
        else:
            if robot.check_sub_key(("follows",), follow_list_data):
                if robot.check_sub_key(("page_info", "nodes"), follow_list_data["follows"]):
                    if robot.check_sub_key(("end_cursor", "has_next_page"), follow_list_data["follows"]["page_info"]):
                        return follow_list_data["follows"]
    return None
def get_one_page_post(coser_id, page_count):
    # http://bcy.net/u/50220/post/cos?&p=1
    post_url = "http://bcy.net/u/%s/post/cos?&p=%s" % (coser_id, page_count)
    post_page_return_code, post_page = tool.http_request(post_url)[:2]
    if post_page_return_code == 1:
        return post_page
    return None
def get_image_url_list(cp_id, rp_id):
    # http://bcy.net/coser/detail/9299/36484
    rp_url = "http://bcy.net/coser/detail/%s/%s" % (cp_id, rp_id)
    rp_page_return_code, rp_page_response = tool.http_request(rp_url)[:2]
    if rp_page_return_code == 1:
        return re.findall("src='([^']*)'", rp_page_response)
    return None
def jkt(file_handle):
    index_url = "http://www.jkt48.com/member/list"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        page = tool.find_sub_string(page, '<div id="mainCol">', "<!--end #mainCol-->", 1)
        start_index = 0
        start_index_list = []
        while start_index != -1:
            start_index = page.find('<a name="', start_index + 1)
            start_index_list.append(start_index)
        for i in range(0, len(start_index_list) - 1):
            start = start_index_list[i]
            end = start_index_list[i + 1]
            if end == -1:
                end = len(page)
            split_page = page[start: end]
            team_name = tool.find_sub_string(split_page, "<h2>", "</h2>")
            if team_name.find("Team") == -1:
                team_name = "Team kenkyusei"
            team_name = "JKT48 " + team_name
            member_list = re.findall('<div class="profileWrap">([\s|\S]*?)</div><!--/loop-->', split_page)
            for member in member_list:
                member = member.replace("<br>", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name = english_name = tool.find_sub_string(member, 'alt="', '"')
                file_handle.write(japanese_name + "\t" + english_name + "\t" + team_name + "\n")
def ske(file_handle):
    split_list = {
        "SKE48 Team S": ("<!-- LIST - TEAM S -->", "<!-- //LIST - TEAM S -->"),
        "SKE48 Team KII": ("<!-- LIST - TEAM KII -->", "<!-- //LIST - TEAM KII -->"),
        "SKE48 Team E": ("<!-- LIST - TEAM E -->", "<!-- //LIST - TEAM E -->"),
        "SKE48 Team Kenkyusei": ("<!-- LIST - KENKYUSEI -->", "<!-- //LIST - KENKYUSEI -->")
    }
    index_url = "http://www.ske48.co.jp/profile/list.php"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        for team_name in split_list:
            team_page = tool.find_sub_string(page, split_list[team_name][0], split_list[team_name][1])
            member_list = re.findall('<dl>([\s|\S]*?)</dl>', team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                japanese_name_find = re.findall('<h3><a href="./\?id=[^"]*">([^<]*)</a></h3>', member)
                english_name = tool.find_sub_string(member, '<h3 class="en">', '</h3>')
                plus_text = tool.find_sub_string(member, '<li class="textPlus">', '</li>')
                if len(japanese_name_find) != 1:
                    print "error japanese_name_find"
                    continue
                if not english_name:
                    print "error english_name"
                    continue
                japanese_name = japanese_name_find[0].replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                if plus_text and plus_text.find("兼任") > 0:
                    team = team_name + " / " + plus_text.split("/")[-1].strip().replace("チーム", " Team ").replace("兼任", "")
                else:
                    team = team_name
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team + "\n")
def hkt(file_handle):
    index_url = "http://www.hkt48.jp/profile/"
    return_code, page = tool.http_request(index_url)[:2]
    if return_code == 1:
        team_find = re.findall('(<h3>[\s|\S]*?)<!-- / .contsbox --></div>', page)
        for team_page in team_find:
            team = tool.find_sub_string(team_page, "<h3>", "</h3>")
            if not team:
                print "error team"
                continue
            team = team.strip()
            member_list = re.findall("<li>([\s|\S]*?)</li>", team_page)
            for member in member_list:
                member = member.replace("<br />", "").replace("\n", "").replace("\r", "").replace("\t", "")
                name_find = re.findall('''<a href="/profile/[\d]*"><img src="[^"]*" alt="[^"]*" width="120" height="150" /><span class='name_j'>([^"]*)</span><span class='name_e'>([^<]*)</span></a> ''', member)
                if len(name_find) != 1:
                    print "error name_find"
                    continue
                japanese_name, english_name = name_find[0]
                team_plus_find = re.findall('<div class="team_j">([^<]*)</div>', member)
                team_name = team
                if len(team_plus_find) == 1:
                    if team_plus_find[0].find("兼任") >= 0:
                        team_name = team + " / " + team_plus_find[0].split("/")[-1].strip().replace("兼任", "")
                japanese_name = japanese_name.replace(" ", "")
                first_name, last_name = english_name.strip().title().split(" ", 1)
                file_handle.write(japanese_name + "\t" + last_name + " " + first_name + "\t" + team_name + "\n")
def get_one_page_audio_list(account_id, page_type, page_count):
    # e.g. http://5sing.kugou.com/<account_id>/<page_type>/<page_count>.html
    audio_album_url = "http://5sing.kugou.com/%s/%s/%s.html" % (account_id, page_type, page_count)
    audio_album_return_code, audio_album_page = tool.http_request(audio_album_url)[:2]
    if audio_album_return_code == 1:
        return re.findall('<a href="http://5sing.kugou.com/' + page_type + '/([\d]*).html" [\s|\S]*? title="([^"]*)">', audio_album_page)
    return None
def get_api_info(account_name):
    photo_index_url = "https://www.flickr.com/photos/%s" % account_name
    photo_index_return_code, photo_index_page = tool.http_request(photo_index_url)[:2]
    if photo_index_return_code == 1:
        user_id = tool.find_sub_string(photo_index_page, '"nsid":"', '"')
        site_key = tool.find_sub_string(photo_index_page, '"site_key":"', '"')
        return {"user_id": user_id, "site_key": site_key}
    return None
def unfollow(account_id):
    unfollow_url = "http://bcy.net/weibo/Operate/follow?"
    unfollow_post_data = {"uid": account_id, "type": "unfollow"}
    unfollow_return_code, unfollow_return_data = tool.http_request(unfollow_url, unfollow_post_data)[:2]
    if unfollow_return_code == 1:
        if int(unfollow_return_data) == 1:
            return True
    return False
def get_suid(account_id):
    index_page_url = "http://www.miaopai.com/u/paike_%s" % account_id
    index_page_return_code, index_page = tool.http_request(index_page_url)[:2]
    if index_page_return_code == 1:
        suid = tool.find_sub_string(index_page, '<button class="guanzhu gz" suid="', '" heade="1" token="">+关注</button>')
        if suid:
            return suid
    return None
def get_user_id(account_id):
    index_url = "http://changba.com/u/%s" % account_id
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        user_id = tool.find_sub_string(index_page, "var userid = '", "'")
        if user_id:
            return user_id
    return None
def get_account_id(account_name):
    account_index_url = "https://twitter.com/%s" % account_name
    account_index_return_code, account_index_page = tool.http_request(account_index_url)[:2]
    if account_index_return_code == 1:
        account_id = tool.find_sub_string(account_index_page, '<div class="ProfileNav" role="navigation" data-user-id="', '">')
        if account_id:
            return account_id
    return None
def follow(account_id):
    follow_url = "http://bcy.net/weibo/Operate/follow?"
    follow_post_data = {"uid": account_id, "type": "dofollow"}
    follow_return_code, follow_return_data = tool.http_request(follow_url, follow_post_data)[:2]
    if follow_return_code == 1:
        # 0: not logged in, 11: followed successfully, 12: already following
        if int(follow_return_data) == 12:
            return True
    return False
def unfollow_account(auth_token, account_id):
    unfollow_url = "https://twitter.com/i/user/unfollow"
    unfollow_data = {"user_id": account_id}
    header_list = {"Cookie": "auth_token=%s;" % auth_token, "Referer": "https://twitter.com/"}
    unfollow_return_code, unfollow_data = tool.http_request(unfollow_url, unfollow_data, header_list)[:2]
    if unfollow_return_code == 1:
        if robot.check_sub_key(("new_state",), unfollow_data) and unfollow_data["new_state"] == "not-following":
            return True
    return False
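# --- Hypothetical convenience wrapper (not in the original source) combining
# get_account_id() and unfollow_account() defined above: resolve a Twitter screen
# name to its numeric id, then unfollow it. auth_token must be a valid auth_token
# cookie; both helpers use the blocking tool.http_request contract seen in this file.
def unfollow_by_name(auth_token, account_name):
    account_id = get_account_id(account_name)
    if account_id is None:
        return False
    return unfollow_account(auth_token, account_id)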
def get_thread_author_post(thread_url):
    thread_return_code, thread_page, thread_response = tool.http_request(thread_url)
    if thread_return_code == 1:
        content_type = tool.get_response_info(thread_response.info(), "Content-Type")
        charset = tool.find_sub_string(content_type, "charset=")
        post_message = tool.find_sub_string(thread_page, '<td class="t_f" id="postmessage_', '<div id="comment_')
        post_message = post_message[post_message.find('">') + 2: post_message.rfind("</td>")]
        return post_message.decode(charset)
    return None
def get_promote_info(info):
    # Fetch the price plus promotions, coupons and gifts
    promote_api_url = PROMOTE_URL % (
        MY_AREA[0],
        MY_AREA[1],
        MY_AREA[2],
        info["itemid"],
        info["分类id"],
        int(time.time() * 1000),
    )
    # Fetch the page HTML
    if not DEBUG:
        response = yield tool.http_request({
            "url": promote_api_url,
            "method": "GET",
            "headers": HEADERS
        })
        open("kaola.promopt_page.html", "w").write(tool.try_decode_html_content(response.body))
    item_content = open("kaola.promopt_page.html", "r").read()
    item_content = tool.json_load(item_content)
    # Aren't these two exactly the same?
    skuPrice = item_content["data"].get("skuPrice") or item_content["data"]["skuDetailList"][0]["skuPrice"]
    min_price = min(skuPrice["currentPrice"], skuPrice["kaolaPrice"], skuPrice["suggestPrice"], skuPrice["marketPrice"])
    presale = item_content["data"].get(
        "depositGoodsAdditionalInfo"
    ) or item_content["data"]["skuDetailList"][0]["depositSkuAdditionalInfo"]
    if presale:
        min_price = presale.get("handPrice") or min_price
    current_store = item_content["data"].get(
        "goodsCurrentStore"
    ) or item_content["data"]["skuDetailList"][0]["skuStore"]["currentStore"]
    promotion_info = item_content["data"].get("promotionList") or item_content["data"]["skuDetailList"][0]["promotionList"] or []
    promote = [[x["promotionContent"], x["promotionUrl"], "0000 ~ 0000"] for x in promotion_info]
    quan = item_content["data"].get("goodsCouponList") or []
    # q.d()
    return {
        "min_price": min_price,
        "current_store": current_store,
        "promote": promote,
        "quan": quan,
        "presale": bool(presale),
    }
def check_big_image(image_url, big_2_small_list):
    if image_url in big_2_small_list:
        big_image_display_page_return_code, big_image_display_page = tool.http_request(big_2_small_list[image_url])[:2]
        if big_image_display_page_return_code == 1:
            temp_image_url = tool.find_sub_string(big_image_display_page, '<img src="', '"')
            if temp_image_url != "/img/expired.gif":
                return temp_image_url, False
            else:
                # Once one expired image is found, every earlier image is expired too, so stop checking
                return image_url, True
    return image_url, False
def get_one_page_diary_data(account_id, page_count):
    # http://www.keyakizaka46.com/mob/news/diarKiji.php?cd=member&ct=01&page=0&rw=20
    diary_page_url = "http://www.keyakizaka46.com/mob/news/diarKiji.php"
    diary_page_url += "?cd=member&ct=%02d&page=%s&rw=%s" % (int(account_id), page_count - 1, IMAGE_COUNT_PER_PAGE)
    diary_return_code, diary_page = tool.http_request(diary_page_url)[:2]
    if diary_return_code == 1:
        diary_page = tool.find_sub_string(diary_page, '<div class="box-main">', '<div class="box-sideMember">')
        if diary_page:
            return re.findall("<article>([\s|\S]*?)</article>", diary_page)
    return None
def get_bbs_forum_url_list(index_url):
    index_return_code, index_page = tool.http_request(index_url)[:2]
    if index_return_code == 1:
        forum_find = re.findall('<a href="(forum-\w*-\d*.\w*)"[^>]*>([\S]*)</a>', index_page)
        host = index_url[0: index_url.rfind("/") + 1]
        forum_url_list = {}
        for forum_path, forum_name in forum_find:
            forum_url_list[host + forum_path] = forum_name
        return forum_url_list
    return None
def save_video(ts_file_list, file_path):
    file_handle = open(file_path, "wb")
    for ts_file_url in ts_file_list:
        ts_file_return_code, ts_file_data = tool.http_request(ts_file_url)[:2]
        if ts_file_return_code == 1:
            file_handle.write(ts_file_data)
        else:
            return False
    file_handle.close()
    return True
def get_ts_url_list(file_url, ts_file_list):
    file_return_code, file_data = tool.http_request(file_url)[:2]
    if file_return_code == 1:
        new_file_url_list = re.findall("(/ext_tw_video/[\S]*)", file_data)
        for new_file_url in new_file_url_list:
            new_file_url = "https://video.twimg.com%s" % new_file_url
            if new_file_url.split(".")[-1] == "m3u8":
                get_ts_url_list(new_file_url, ts_file_list)
            elif new_file_url.split(".")[-1] == "ts":
                ts_file_list.append(new_file_url)
def get_ts_url_list(link_url):
    video_link_return_code, video_link_data = tool.http_request(link_url)[:2]
    if video_link_return_code == 1:
        ts_id_list = re.findall("([\S]*.ts)", video_link_data)
        prefix_url = link_url[:link_url.rfind("/") + 1]
        ts_file_list = []
        for ts_id in ts_id_list:
            ts_file_list.append(prefix_url + ts_id)
        return ts_file_list
    else:
        return None
def get_one_forum_page_thread_url_list(forum_url):
    forum_return_code, forum_page = tool.http_request(forum_url)[:2]
    if forum_return_code == 1:
        forum_page = tool.find_sub_string(forum_page, '<div id="threadlist"', '<div id="filter_special_menu"', 1)
        thread_find = re.findall('<a href="(thread-\d*-1-1.\w*)" onclick="atarget\(this\)" class="s xst">([\S|\s]*?)</a>', forum_page)
        host = forum_url[0: forum_url.rfind("/") + 1]
        thread_url_list = {}
        for forum_path, forum_name in thread_find:
            thread_url_list[host + forum_path] = forum_name
        return thread_url_list
    return None
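# --- Hedged sketch (not in the original source) of how the Discuz helpers above might
# be chained: list the forums on a board index with get_bbs_forum_url_list(), list the
# threads on each forum's first page with get_one_forum_page_thread_url_list(), then
# pull each thread's author post with get_thread_author_post(). The index URL is a
# placeholder supplied by the caller.
def walk_bbs(index_url):
    results = {}
    forum_url_list = get_bbs_forum_url_list(index_url) or {}
    for forum_url, forum_name in forum_url_list.items():
        thread_url_list = get_one_forum_page_thread_url_list(forum_url) or {}
        for thread_url, thread_name in thread_url_list.items():
            results[thread_name] = get_thread_author_post(thread_url)
    return results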
def get_video_url_list(tweet_id):
    video_page_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_page_return_code, video_page = tool.http_request(video_page_url)[:2]
    if video_page_return_code == 1:
        m3u8_file_url = tool.find_sub_string(video_page, "&quot;video_url&quot;:&quot;", "&quot;")
        if m3u8_file_url:
            m3u8_file_url = m3u8_file_url.replace("\\/", "/")
            ts_url_list = []
            get_ts_url_list(m3u8_file_url, ts_url_list)
            return "ts", ts_url_list
        vmap_file_url = tool.find_sub_string(video_page, "&quot;vmap_url&quot;:&quot;", "&quot;")
        if vmap_file_url:
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_return_code, vmap_file = tool.http_request(vmap_file_url)[:2]
            if vmap_file_return_code:
                media_file_url = tool.find_sub_string(vmap_file, "<![CDATA[", "]]>")
                if media_file_url:
                    file_type = media_file_url.split(".")[-1].split("?")[0]
                    return file_type, media_file_url
    return "", []
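# --- Hypothetical end-to-end helper (not in the original source) that uses
# get_video_url_list() together with save_video() defined above: download a tweet's
# video whether it is served as HLS .ts segments or as a single media file. The tweet
# id and output path are caller-supplied placeholders.
def download_tweet_video(tweet_id, file_path):
    file_type, file_url = get_video_url_list(tweet_id)
    if file_type == "ts":
        return save_video(file_url, file_path)  # here file_url is a list of .ts URLs
    elif file_url:
        return_code, data = tool.http_request(file_url)[:2]
        if return_code == 1:
            open(file_path, "wb").write(data)
            return True
    return False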
def construct_http_get(proxy_host, proxy_port, timeout):
    return tool.http_request({
        "url": URL,
        "method": "GET",
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "body": None,
        "proxy_host": proxy_host,
        "proxy_port": proxy_port,
        "request_timeout": timeout,
    })
def crawler_page_html(page_url, retry=True):
    # raise gen.Return(open("test.html", "rb").read())  # DEBUG
    req_data = {
        "url": page_url,
        "method": "GET",
        "headers": {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "accept-encoding": "gzip, deflate",
            "accept-language": "zh-CN,zh;q=0.9",
            "cache-control": "max-age=0",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36",
            "Connection": "keep-alive",
            "Cookie": "ASPSESSIONIDASCTATQD=HLEJHLFCBJGLBDACDDJMMAHI; UM_distinctid=16426d109c7134-042982a838072c-5b183a13-1fa400-16426d109c832a; ASPSESSIONIDACAQTRDQ=KDGLGOOAPFCDAAPFELIODBBD; CNZZDATA1256284042=1049911583-1529656073-http%253A%252F%252Fwww.89ip.cn%252F%7C1530003389",
        },
        "proxy_host": None,
        "proxy_port": None,
        "request_timeout": 30,
    }
    response = yield tool.http_request(req_data)
    if response.code == 599 and retry:
        response = yield tool.http_request(req_data)
    if response.code != 200:
        # raise Exception("http status code %s,%s" % (response.code, response.error))
        raise gen.Return("")
    # open("test.html", "wb").write(response.body)  # DEBUG
    raise gen.Return(response.body)
def construct_http_post(proxy_host, proxy_port, timeout):
    return tool.http_request({
        "url": URL,
        "method": "POST",
        "headers": {
            "Content-Length": "17",
            "Content-Type": "application/x-www-form-urlencoded",
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3282.119 Safari/537.36",
        },
        "body": "username=bogeming",
        "proxy_host": proxy_host,
        "proxy_port": proxy_port,
        "request_timeout": timeout,
    })
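# --- Sketch (not in the original source) of how a proxy validator might drive the two
# constructors above, assuming construct_http_get/construct_http_post return a Tornado
# Future resolving to an HTTPResponse (as the other coroutines in this file expect).
# Host and port are placeholders supplied by the caller.
import time
from tornado import gen

@gen.coroutine
def check_proxy(proxy_host, proxy_port, timeout=10):
    started = time.time()
    get_response = yield construct_http_get(proxy_host, proxy_port, timeout)
    post_response = yield construct_http_post(proxy_host, proxy_port, timeout)
    usable = get_response.code == 200 and post_response.code == 200
    raise gen.Return({"usable": usable, "latency": time.time() - started})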
def get_presale_info(info):
    # Fetch basic item information & price
    presale_api_url = PRESALE_URL % (
        info["itemid"],
        str(time.time()).replace(".", ""),
        JQNAME,
    )
    ap(presale_api_url)
    if not DEBUG:
        # response = requests.get(presale_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": presale_api_url,
            "method": "GET",
            "headers": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                # "Referer": presale_api_url,
                # "Pragma": "no-cache",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        # ap("response:", response)
        open("presale_api.js", "w").write(tool.try_decode_html_content(response.body))
    presale_api_content = open("presale_api.js", "r").read()
    presale_api_content_json = get_jsonp_json(presale_api_content)
    # ap(presale_api_content_json)
    # q.d()
    if not presale_api_content_json.get("ret"):
        return None
    return {
        "currentPrice": presale_api_content_json["ret"]["currentPrice"],
        "presaleStartTime": presale_api_content_json["ret"]["presaleStartTime"],
        "presaleEndTime": presale_api_content_json["ret"]["presaleEndTime"],
        # "balanceBeginTime": presale_api_content_json["ret"]["balanceBeginTime"],
        # "balanceEndTime": presale_api_content_json["ret"]["balanceEndTime"],
    }
def get_base_info(item):
    # Fetch the page HTML
    if not DEBUG:
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": HEADERS
        })
        open("kaola.base_url_page.html", "w").write(tool.try_decode_html_content(response.body))
    item_content = open("kaola.base_url_page.html", "r").read()
    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "$addGoods" in x), -1)
    info_text = item_content_lines[icat + 1:icat + 12]
    for i, line in enumerate(info_text):
        if "," in info_text[i]:
            info_text[i] = info_text[i][:info_text[i].index(",")]
            info_text[i] = info_text[i].replace("'", "").strip()
        else:
            ap("[WARN]:", "Something unexpected happened.")
            info_text[i] = ""
    info = {
        "分类id": info_text[0],    # category id
        "品牌id": info_text[1],    # brand id
        "商品名称": info_text[2],  # product name
        "itemid": info_text[3],
        "商品售价": info_text[4],  # selling price
        # "商品图片": info_text[5],  # product image
        "分类名": info_text[6],    # category name
        "品牌名": info_text[7],    # brand name
        "商品库存": info_text[8],  # stock
        "网络价": info_text[9],    # online price
        # "收藏人数": info_text[10],  # number of favourites
    }
    return info
def get_base_info(item):
    # Fetch the page HTML
    if not DEBUG:
        # response = requests.get(item["url"], headers={}, timeout=16)
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": {
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                "Referer": item["url"],
                "Pragma": "no-cache",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
            }
        })
        # q.d()
        open("content_page.html", "w").write(tool.try_decode_html_content(response.body))
    item_content = open("content_page.html", "r").read()
    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "cat: [" in x), -1)
    info = get_item_neighbor(item_content_lines, icat)
    for line in item_content_lines[:20]:
        if "<title>" in line:
            info["name"] = re.sub(
                r"""([\W]*<title>|[【][^【]*[】][^】]*</title>[\W]*$)""", "", line)
    return info
def get_base_info(item):
    # Fetch the page HTML
    if not DEBUG:
        response = yield tool.http_request({
            "url": item["url"],
            "method": "GET",
            "headers": HEADERS
        })
        open("yanxuan.base_url_page.html", "w").write(tool.try_decode_html_content(response.body))
    item_content = open("yanxuan.base_url_page.html", "r").read()
    item_content_lines = item_content.split("\n")
    icat = next(
        (i for (i, x) in enumerate(item_content_lines) if "\"item\":" in x), -1)
    info_text = item_content_lines[icat][7:-1]
    info_json = tool.json_load(info_text)
    # info_text = info_text.replace("\"item\":", "")
    # if info_text[-1] == ",":
    #     info_text = info_text[0:-1]
    if item.get("iid"):
        item_info = next(
            (x for x in info_json["skuList"] if x["id"] == item["iid"]), {})
    else:
        item_info = info_json["skuList"][item["index"]]
    if not item_info:
        return None
    promote_info = item_info.get("hdrkDetailVOList")
    if item_info.get("couponShortNameList"):
        quan_info = item_info.get("couponShortNameList")
    elif item_info.get("shortCouponList"):
        quan_info = [x["displayName"] for x in item_info["shortCouponList"]]
    else:
        quan_info = None
    price = min(item_info["retailPrice"], item_info["counterPrice"],
                item_info["calcPrice"], item_info["preSellPrice"])
    if item_info.get("spmcBanner"):
        spmc_price = float(item_info["spmcBanner"].get("spmcPrice") or 0)
        price = spmc_price > 0 and min(spmc_price, price) or price
    if item_info.get("detailPromBanner"):
        activity_price = float(
            item_info["detailPromBanner"].get("activityPrice") or 0)
        price = activity_price > 0 and min(activity_price, price) or price
    info = {
        "name": item_info["skuTitle"],
        "iid": item_info["id"],
        "promote": [[x["name"], x["huodongUrlPc"], "0 ~ 0"] for x in promote_info],
        "quan": quan_info,
        "price": price,
        "store": item_info["sellVolume"],
    }
    return info
def get_promote_info(info):
    # Fetch the price plus promotions, coupons and gifts
    promote_api_url = PROMOTE_URL % (
        info["itemid"],
        MY_AREA,
        info["shopId"],
        info["venderId"],
        info["cat"].replace(",", "%2C"),
        info["price"],
        str(time.time()).replace(".", "")[:-3],
        JQNAME,
    )
    # promote_api_url = """https://cd.jd.com/promotion/v2?callback=jQuery5415158&skuId=65610440044&area=19_1601_3635_0&shopId=10131385&venderId=10252350&cat=1672%%2C2599%%2C12078&isCanUseDQ=1&isCanUseJQ=1&platform=0&orgType=2&jdPrice=299.00&appid=1&_=%s""" % str(time.time()).replace(".", "")[:-3]
    # promote_api_url = """https://cd.jd.com/promotion/v2?callback=jQuery4255721&skuId=65610440044&area=19_1601_3635_0&shopId=10131385&venderId=10252350&cat=1672%%2C2599%%2C12078&isCanUseDQ=1&isCanUseJQ=1&platform=0&orgType=2&jdPrice=299.00&appid=1&_=%s""" % str(time.time()).replace(".", "")[:-3]
    # ap(promote_api_url)
    if not DEBUG:
        # response = requests.get(promote_api_url, headers={}, timeout=16)
        response = yield tool.http_request({
            "url": promote_api_url,
            "method": "GET",
            "headers": {
                # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                # "Accept-Encoding": "gzip, deflate, br",
                # "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6,ja;q=0.5",
                "Referer": "https://item.jd.com/65610440044.html",
                "Pragma": "no-cache",
                # Ugh, a cookie is required now
                "cookie": "__jdv=76161171|direct|-|none|-|1614594740019; __jdu=1614594740018869872184; areaId=19; ipLoc-djd=19-1601-3633-0; PCSYCityID=CN_440000_440100_440106; shshshfpa=00883b4f-d3c1-1602-7cd6-17731ed20a6e-1614594741; shshshfpb=m8UQnw74GyqJycpcp0lvCLg%3D%3D; __jda=122270672.1614594740018869872184.1614594740.1614594740.1614594740.1; __jdc=122270672; 3AB9D23F7A4B3C9B=RE4QF44JWCVUXEC7MQAZGA24NVF27LEI6CEQC4P7SABGXROC4ZDLKLWQBR6ULUZOEYHS5I7WMZBDNH5KDNWYC7VZFY; shshshfp=0263d234510f0c11eede903101b88cca; shshshsID=4122aa91f19c1d2e5fbd4fbec3deab0d_3_1614594756699; __jdb=122270672.4.1614594740018869872184|1.1614594740",
                "sec-fetch-dest": "script",
                "sec-fetch-mode": "no-cors",
                "sec-fetch-site": "same-site",
                # "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.192 Safari/537.36",
            }
        })
        open("promote_api.js", "w").write(tool.try_decode_html_content(response.body))
    promote_api_content = open("promote_api.js", "r").read()
    promote_api_content_json = get_jsonp_json(promote_api_content)
    # ap(promote_api_content_json)
    # Inline ads
    ads_strings = [
        x["ad"].replace("&lt;", "<").replace("&gt;", ">")
        for x in promote_api_content_json.get("ads") or []
    ]
    # Promotions
    promote_strings = map(
        lambda x: [
            x["content"],
            GET_PROMOTE_URL % x["pid"].split("_")[0],
            "%s ~ %s" % (tool.get_datetime_from_stamp(x["st"]),
                         tool.get_datetime_from_stamp(x["d"])),
        ],
        promote_api_content_json["prom"]["pickOneTag"])
    promote_strings = list(promote_strings)
    # ap(promote_strings)
    # Gifts / freebies
    gift_strings = []
    for tag in promote_api_content_json["prom"]["tags"]:
        if "gifts" in tag:
            gift_string = map(lambda x: [x["nm"], CONTENT_URL % x["sid"]], tag["gifts"])
            gift_string = list(gift_string)
            gift_strings.append([
                tag["name"],
                tool.get_datetime_from_stamp(tag["d"]),
                gift_string
            ])
        elif "name" in tag:
            gift_strings.append(
                [tag["name"], tool.get_datetime_from_stamp(tag["d"])])
    # Rebate coupons / points
    feedback_strings = ""
    if promote_api_content_json.get("quan"):
        feedback_url = promote_api_content_json["quan"]["actUrl"] or (
            MFDETAIL % promote_api_content_json["quan"]["activityId"])
        if feedback_url[:2] == "//":
            feedback_url = "https:%s" % feedback_url
        feedback_strings = [
            feedback_url,
            promote_api_content_json["quan"]["title"]
        ]
    # Coupons to claim / redeem
    quan_strings = []
    if promote_api_content_json.get("skuCoupon"):
        for item in promote_api_content_json["skuCoupon"]:
            quan_string = item.get("allDesc") or "满%s减%s" % (item["quota"], item["discount"])
            quan_strings.append([
                quan_string,
                "%s ~ %s" % (item.get("beginTime") or "", item.get("endTime") or "")
            ])
            quan_strings[-1].append(item.get("key"))
            quan_strings[-1].append(item.get("url") or "")
    # q.d()
    return {
        "promote": promote_strings,
        "gift": gift_strings,
        "quan": quan_strings,
        "feedback": feedback_strings,
        "ads": ads_strings,
    }
def get_page(self, group, i):
    """ [] """
    page_url = PAGE_STRUCT % (group, i * 25)
    tool.aprint("doing:", page_url)
    res = yield tool.http_request({
        "url": page_url,
        "method": "GET",
        "headers": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh-TW;q=0.7,zh;q=0.6",
            "Referer": page_url,
            "Pragma": "no-cache",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36",
        }
    })
    open("www.douban.com.html", "wb").write(res.body)
    content = open("www.douban.com.html", "r").read()
    # news_list
    content_dom = pyquery.parse_content_to_dom(content)
    content_ele = content_dom.find("#content").find("table>tr:gt(0)")
    result_list = []
    for i, content in pyquery.iter_eles(content_ele):
        title = content.find("td").eq(0).find("a").attr("title")
        href = content.find("td").eq(0).find("a").attr("href")
        author = content.find("td").eq(1).find("a").text()
        author_href = content.find("td").eq(1).find("a").attr("href")
        comment = content.find("td").eq(2).text()
        date = content.find("td").eq(3).text()
        result_list.append({
            "group": group,
            "group_name": self.name_list[group],
            "title": title,
            "href": href,
            "author": author,
            "comment": comment or "0",
        })
        # date: 2014-05-31 || 08-23 15:29
        if ":" in date:
            date = "%s-%s:00" % (tool.get_date_string()[:4], date)
        else:
            date = "%s 00:00:00" % (date)
        result_list[-1]["date"] = date
        result_list[-1]["id"] = int(href.split("/")[-2])
        result_list[-1]["author_id"] = author_href.split("/")[-2]
        result_list[-1]["price"] = self.get_price_from_item(result_list[-1])
    tool.aprint("result:", len(result_list))
    # return result_list
    raise tornado.gen.Return(result_list)