def prepare_js(self):
    ''' prepare js '''
    pre_text = basic_req(self.JD_URL, 3)
    INDEX_JS_URL = re.findall(r'src=.*index\.js.*" t', pre_text)[0].split('"')[1]
    origin_js = basic_req(INDEX_JS_URL, 3)

    ''' decode js '''
    decode_js = codecs.unicode_escape_decode(origin_js)[0]

    ''' replace the obfuscation params array with its literal values '''
    replace_list_str = decode_js.split(';')[2]
    empty_index = replace_list_str.index(' ') + 1
    begin_index = replace_list_str.index('=[') + 2
    end_index = replace_list_str.index(']')
    replace_list = replace_list_str[begin_index:end_index].split(',')
    rp = replace_list_str[empty_index:begin_index - 2]
    for ii, jj in enumerate(replace_list):
        decode_js = decode_js.replace('{}[{}]'.format(rp, ii), jj)
    self.slat = replace_list[46].replace('"', '')
    echo(2, 'salt', self.slat)

    ''' write the decoded js to a local file '''
    with open(decoder_js_path, 'w') as f:
        f.write(';\n'.join(decode_js.split(';')))

    ''' delete the function about ajax '''
    del_str = re.findall(r'_.{6,10}\["ajaxPrefilter.*\)\}\}\)', decode_js)
    del_begin_index = decode_js.index(del_str[0])
    result_js = decode_js[:del_begin_index] + \
        decode_js[del_begin_index + len(del_str[0]):]
    self.result_js = result_js
    self.js_compile = execjs.compile(open(hotel_js_path).read())
    echo(1, 'Load hotel index js success!!!')
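# A minimal, self-contained sketch of the substitution step above: this kind of
# obfuscated js keeps its string literals in one array and references them as
# `_0x1a2b[i]`; replacing each reference with its literal restores readable code.
# The variable names and sample snippet here are hypothetical, not taken from
# the real index.js.
def substitute_param_array(js: str) -> str:
    import re
    decl = re.search(r'var (\w+)=\[(.*?)\];', js)
    name, literals = decl.group(1), decl.group(2).split(',')
    for idx, literal in enumerate(literals):
        js = js.replace('{}[{}]'.format(name, idx), literal)
    return js

# substitute_param_array('var _0x1a2b=["ajax","salt"];f(_0x1a2b[0],_0x1a2b[1]);')
# -> 'var _0x1a2b=["ajax","salt"];f("ajax","salt");'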
def test_change_youdaoyun(self, article_id, body, article_name):
    """ change youdaoyun article demo
        @param 'buildmd/data/cookie': cookie in youdaoyun web
        @param article_id: change article No.
        @param body: change article body
        @param article_name: change article name
    """
    url = 'https://note.youdao.com/yws/api/personal/sync?method=push&keyfrom=web&cstk=E3CF_lx8'
    headers = {
        'Cookie': '',
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
        'Origin': 'https://note.youdao.com',
        'Referer': 'https://note.youdao.com/web'
    }
    if not os.path.exists('%scookie' % data_dir):
        print('Youdao Note cookie not exist!!!')
        return
    with codecs.open('%scookie' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readline()
    headers['Cookie'] = cookie[:-1]
    headers['Host'] = url.split('/')[2]
    file_list_url = 'https://note.youdao.com/yws/api/personal/file?method=listRecent&offset=0&limit=30&keyfrom=web&cstk=E3CF_lx8'
    file_data = {'cstk': 'E3CF_lx8'}
    ca = basic_req(file_list_url, 11, data=file_data, header=headers)
    if not len(ca):
        print('List Error')
        return
    change_data_origin = ca[article_id]['fileEntry']
    body_string = [
        '<?xml version="1.0"?><note xmlns="http://note.youdao.com" schema-version="1.0.3" file-version="0"><head/><body><para><coId>12-1550424181958</coId><text>',
        body,
        '</text><inline-styles/><styles/></para></body></note>'
    ]
    change_data = {
        'name': article_name,
        'fileId': change_data_origin['id'],
        'parentId': change_data_origin['parentId'],
        'domain': change_data_origin['domain'],
        'rootVersion': -1,
        'sessionId': '',
        'modifyTime': int(round(time.time())),
        'bodyString': ''.join(body_string),
        'transactionId': change_data_origin['id'],
        'transactionTime': int(round(time.time())),
        'orgEditorType': change_data_origin['orgEditorType'],
        'tags': change_data_origin['tags'],
        'cstk': 'E3CF_lx8'
    }
    print(change_data)
    cb = basic_req(url, 12, data=change_data, header=headers)
    return cb
def get_request(self, url: str, types: int, functs, header: dict = None):
    ''' retry wrapper: re-request while `functs(req)` reports a bad response '''
    if header:
        req = basic_req(url, types, header=header)
    else:
        req = basic_req(url, types)
    if functs(req):
        if can_retry(url):
            return self.get_request(url, types, functs, header)
        return
    return req
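# Hedged usage sketch of get_request above: `functs` is a predicate that
# returns True when the response is bad and should be retried (that is what
# the body implies). The url and validator here are hypothetical examples.
def fetch_playlist_json(spider):
    return spider.get_request(
        'https://example.com/api/playlist', 1,
        functs=lambda req: req is None or 'data' not in req,
        header={'Accept': 'application/json'})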
def get_goods_id_first(self, origin_url, index):
    """ get goods id first """
    origin_url = origin_url.replace('https', 'http')
    # first_result = proxy_req(origin_url, 0)
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if '"title":"' not in wait:
        return
    title = re.findall('"title":".*","', wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)
def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = proxy_req(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    if not len(head) or len(head[0].find_all('script')) < 4 \
            or '{' not in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False  # parse failed; skip the parse block after retrying
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + \
            str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs
def update_article(self, article_id: str, article_body: str):
    p = self.share2article[article_id][-2].split("/")[-1]
    article_info = self.list_recent[p]
    data = {
        "fileId": p,
        "parentId": article_info["parentId"],
        "domain": article_info["domain"],
        "rootVersion": -1,
        "sessionId": "",
        "modifyTime": int(time_stamp()),
        "bodyString": article_body,
        "transactionId": p,
        "transactionTime": int(time_stamp()),
        "orgEditorType": article_info["orgEditorType"],
        "tags": article_info["tags"],
        "cstk": self.cstk,
    }
    url = self.SYNC_URL % ("push", self.cstk)
    req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
    if req is None or list(req.keys()) != [
        "entry",
        "meta",
        "effectedShareEntries",
        "forcePullVersion",
        "effected",
    ]:
        # `req` is already parsed JSON here, so log it directly
        echo("0|error", "Update article_id {} Error".format(article_id),
             req if req is not None else "")
        return False
    echo("1|warning", "Update article_id {} Success!!!".format(article_id))
    return True
def req_ip66():
    ''' 66ip.cn js decoder '''
    header['Cookie'] = generate_cookie()
    req_text = basic_req(IP66_URL, 3, header=header)
    echo(2, req_text)
    return req_text
def generate_cookie():
    ''' eval 66ip.cn, tested on 19.5.7 '''
    req = basic_req(IP66_URL, 2, header=header)
    basic_cookie = req.cookies.get_dict()

    ''' !important: \b in py -> \x80 '''
    req_text = r'{}'.format(req.text)

    ''' get the script that would be eval-ed '''
    script_text = re.findall('<script>(.*?)</script>', req_text)[0]
    script_text = script_text.replace(
        '{eval(', '{aaa=').replace(');break', ';break')
    script_eval = r'{}'.format(js2py.eval_js(script_text + 'aaa'))
    echo(0, script_eval)
    try:
        ''' replace document & window '''
        params = re.findall(
            r'(__jsl_clearance=.*?)\'\+\(function\(\){(.*?join\(\'\'\))}\)\(\)',
            script_eval)
        wait_eval = params[0][1].replace("document.createElement('div')", "{}")
        wait_replace = re.findall(
            r'=(.{1,5}\.firstChild\.href;)', wait_eval)[0]
        wait_eval = wait_eval.replace(wait_replace, '"http://www.66ip.cn/";')

        ''' eval & encode cookie '''
        other_param = js2py.eval_js(
            'function ddd() {window={};' + wait_eval + '}ddd()')
        cookie = '{}; {}{}'.format(encoder_cookie(
            basic_cookie), params[0][0], other_param)
        echo(1, 'cookie', cookie)
        return cookie
    except:
        return generate_cookie()
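# A minimal sketch of what an `encoder_cookie`-style helper does (the real
# implementation lives elsewhere in this repo, so this is an assumption):
# flatten a cookie dict into the `key=value; key=value` form expected in a
# Cookie request header.
def encode_cookie_dict(cookies: dict) -> str:
    return '; '.join('{}={}'.format(k, v) for k, v in cookies.items())

# encode_cookie_dict({'__jsluid_h': 'abc', 'PHPSESSID': 'xyz'})
# -> '__jsluid_h=abc; PHPSESSID=xyz'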
def load_comment_v2(self, movie_id: int, start: int):
    ''' load comment by proxy '''
    url = self.COMMENT_PROXY_URL % (movie_id, start)
    self.generate_cookie()
    comment_json = basic_req(url, 1)
    if comment_json is None or 'comments' not in comment_json:
        if comment_json is not None and 'code' in comment_json:
            if comment_json['code'] == 5000:
                self.finish_list[(movie_id, start)] = 0
                self.checkpoint()
            else:  # e.g. code == 112: proxy banned or rate limited
                self.proxy_can_use = False
                echo(2, url, 'Failed')
                self.again_list.append([movie_id, start])
        else:
            self.again_list.append([movie_id, start])
            echo(0, url, 'Failed')
        return
    comment_html = comment_json['comments']
    comment = {(movie_id, ii['author']['id']): [
        ii['author']['name'], ii['author']['id'], ii['created_at'],
        ii['content'], '', ii['rating']['value']] for ii in comment_html}
    user_list = {ii['author']['id'] for ii in comment_html}
    self.user_info = {*self.user_info, *user_list}
    self.comment = {**self.comment, **comment}
    if len(user_list) == 100:
        self.more_user.append([movie_id, start + 100])
    self.finish_list[(movie_id, start)] = 0
    self.finish_list[(movie_id, start + 20)] = 0
    self.finish_list[(movie_id, start + 40)] = 0
    self.finish_list[(movie_id, start + 60)] = 0
    self.finish_list[(movie_id, start + 80)] = 0
    self.checkpoint()
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1],
        '&pvid=', cookie[2][:-1],
        '&t=', str(int(round(time.time() * 1000))),
        '&_t=', str(int(round(time.time() * 1000))),
        '&q=', goods_name
    ]
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': cookie[0][:-1],
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
    }
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or 'data' not in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = ['||'.join([str(ii['auctionId']), goods_name, str(ii['zkPrice'])])
             for ii in page_list][0]
    self.goods_name[index] = title
    print(title)
def bulk_import_alimama_once(self, index, group_id):
    """ bulk import alimama """
    url = 'http://pub.alimama.com/favorites/item/batchAdd.json'
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    # `headers` was undefined in the original; rebuilt here from the cookie
    # file, mirroring search_goods_once
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': cookie[0][:-1],
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
    }
    goods_len = len(self.goods_candidate)
    begin_id = index * 200
    end_id = min(goods_len, (index + 1) * 200)
    goods_ids = self.goods_candidate[begin_id:end_id]
    update_data = {
        'groupId': group_id,
        'itemListStr': ','.join(goods_ids),
        't': str(int(round(time.time() * 1000))),
        '_tb_token_': cookie[1][:-1],
        'pvid': cookie[2][:-1]
    }
    print(update_data)
    cb = basic_req(url, 12, data=update_data, header=headers)
    if cb.status_code == 200 and cb.json()['info']['message'] != 'nologin':
        print(cb.json()['data'])
def get_cid(self, bv_id: str):
    playlist_url = self.PLAYLIST_URL % bv_id
    headers = {"Accept": "*/*", "Referer": self.ROOM_INIT_URL % bv_id}
    req = basic_req(playlist_url, 1, header=headers)
    if req is None or list(req.keys()) != self.JSON_KEYS:
        return
    cid = [ii["cid"] for ii in req["data"]]
    return cid
def get_download(self, types: str):
    url = "https://www.proxy-list.download/api/v0/get?l=en&t=" + types
    tt = basic_req(url, 1)
    if tt is None:
        return []
    tt_list = tt[0]["LISTA"]
    echo(1, "Get download", types, len(tt_list))
    return ["{}:{}".format(ii["IP"], ii["PORT"]) for ii in tt_list]
def get_free_proxy(self, url: str):
    req = basic_req(url, 2)
    if req is None:
        return []
    tt = req.text
    t_list = re.findall(r"<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>", tt)
    echo(1, "Get Free proxy List", url, len(t_list))
    return ["{}:{}".format(ii, jj) for ii, jj in t_list]
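# A quick self-check of the proxy-table regex above on a hypothetical row
# (the HTML snippet is made up, not captured from any real proxy-list page):
def _test_free_proxy_regex():
    import re
    sample = '<tr><td>1.2.3.4</td><td>8080</td></tr>'
    pattern = r'<tr><td>(\d*\.\d*\.\d*\.\d*)</td><td>(\d*?)</td>'
    assert re.findall(pattern, sample) == [('1.2.3.4', '8080')]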
def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if result is None or not result or not len(result.find_all('p', class_='content')):
        if can_retry(url):
            return self.get_request_v3(url, types)
        return
    return result
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            return self.get_request(url, types)
        return
    return result
def get_ynote_file(self, offset: int = 0):
    url = self.LISTRECENT_URL % (offset, self.cstk)
    data = {"cstk": self.cstk}
    req = basic_req(url, 11, data=data, header=self.get_ynote_web_header(1))
    if req is None or not isinstance(req, list):
        return None
    list_recent = {ii["fileEntry"]["id"]: ii["fileEntry"] for ii in req}
    self.list_recent = {**self.list_recent, **list_recent}
    echo(1, "Load ynote file {} items.".format(len(self.list_recent)))
    return req
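# Inferred from the literal URLs in test_change_youdaoyun above; the real
# LISTRECENT_URL / SYNC_URL constants live elsewhere in the repo, so treat
# these templates as an assumption:
LISTRECENT_URL = ("https://note.youdao.com/yws/api/personal/file"
                  "?method=listRecent&offset=%d&limit=30&keyfrom=web&cstk=%s")
SYNC_URL = "https://note.youdao.com/yws/api/personal/sync?method=%s&keyfrom=web&cstk=%s"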
def basic_view(self, url: str, times: int, types: int):
    ''' press view, no data input '''
    url = self.AV_URL
    if types == 1:
        html = proxy_req(url, 0)
    else:
        html = basic_req(url, 0)
    if html is False and times < 5:
        self.basic_view(url, times + 1, types)
def get_share_info(self, share_id: str):
    changeJsonTimeout(4)
    url = self.GET_SHARE_URL % share_id
    headers = self.get_tb_headers(self.Y_URL)
    req = basic_req(url, 1, header=headers)
    if req is None:
        return
    info = req["entry"]
    self.share2article[share_id] = (info["name"].replace('.note', ''),
                                    info["id"], info["lastUpdateTime"])
    return req
def user_action(self, hotel_id: int = 4889292):
    url = '{}hotel/{}.html'.format(HOTELS_URL, hotel_id)
    text = basic_req(url, 3)
    page_id = int(re.findall(r'id="page_id" value="(\d*?)" />', text)[0])
    correlation_id = re.findall(r'relationId" value="(\d*?)"/>', text)[0]
    e = self.login_cookie()['_bfa'].split('.')
    common = [
        page_id, e[1] + '.' + e[2], int(e[6]), int(e[7]), correlation_id,
        "M:70,181023_hod_fxtj:B;", '', '2.6.9', "vq5tkk-ufpyck-qsxbg3",
        "", "", "", "", "", "online"
    ]
    _queue = [{
        'action': 'click',
        'xpath': "HTML/BODY[@id='mainbody']/FORM[@id='aspnetForm']/DIV[3][@id='base_bd']/DIV[4]/DIV[@id='divDetailMain']/DIV[9][@id='id_room_select_box']/DIV[2]/DIV/DIV/A[@id='changeBtn'][@x='{}'][@y='{}'][@rx='{}'][@ry='{}']".format(
            random.randint(50, 80), random.randint(650, 750),
            random.randint(20, 40), random.randint(5, 20)),
        'ts': int(time.time() * 1000),
    }]
    ee = [[2, "useraction"], common, _queue]
    eee = json.dumps(ee, separators=(',', ':'))
    print(eee)
    compress = execjs.compile(open(compress_path).read())
    eeee = compress.call('compress', eee)
    echo(2, eeee)
    cookie = {'uid': 'Yn17vOkRm2gW+jCNwT8jPg=='}
    header = {
        'Referer': 'https://hotels.ctrip.com/hotel/4889292.html',
        'Cookie': self.encoder_cookie(cookie),
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3777.0 Safari/537.36',
    }
    url = 'https://s.c-ctrip.com/bf.gif?ac=a&d={}&jv=1.0.0'.format(eeee)
    req = basic_req(url, 2, header=header)
    echo(0, req.cookies.get_dict())
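# Note on the json.dumps call above: separators=(',', ':') produces the
# compact encoding (no spaces after ',' or ':'), which must match what the
# beacon's js compressor saw, e.g.:
# json.dumps([[2, "useraction"]], separators=(',', ':')) -> '[[2,"useraction"]]'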
def decoder_tpwd(self, tpwd: str):
    """ decode the tpwd from taokouling """
    url = self.DECODER_TPWD_URL % (self.api_key, tpwd)
    req = basic_req(url, 1)
    if req is None or isinstance(req, str) or 'ret' not in req:
        return {}
    return req
def basic_press(self, url, times, types):
    """ press view, no data input """
    url = url + str(int(round(time.time() * 1000)))
    if types == 1:
        html = proxy_req(url, 1)
    else:
        html = basic_req(url, 1)
    if html is False and times < 5:
        self.basic_press(url, times + 1, types)
def get_api(self):
    API_KEY = "xxx"
    url = "http://api.scraperapi.com/?api_key={}&url=http://httpbin.org/ip".format(
        API_KEY)
    t_list = []
    for ii in range(38):
        tt = basic_req(url, 1)
        if tt is None:
            continue
        t_list.append(tt["origin"])
    echo(1, "Get scraperapi", len(t_list))
    return t_list
def get_goods_second(self, url, index):
    second_result = basic_req(url, 0, header=self.headers)
    # second_result = proxy_req(url, 0)
    if not second_result or not len(second_result.find_all('input')):
        if can_retry(url):
            self.get_goods_second(url, index)
        return
    goods_id = second_result.find_all('input')[6]['value']
    print(goods_id)
    self.goods_map[index] = goods_id
def request_text(self, url):
    ''' request text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text
def load_city_list(self):
    ''' load city list '''
    text = basic_req(self.MDD_URL, 3)
    city_list = re.findall(
        '/travel-scenic-spot/mafengwo/(.*?).html" target="_blank">(.*?)(</a>|<span)',
        text)
    id2map = {int(ii[0]): ii[1].strip() for ii in city_list if ii[0].isdigit()}
    self.city_list = id2map.keys()
    self.id2map = id2map
def judge_url(self, urls: str, index: int, times: int, ss_test: bool = False):
    """ use /api/playlist to judge http; use /discover/playlist to judge https
        1. don't set timeout = 5
        2. response.result.tracks.size() != 1
    """
    http_type = urls[4] == "s"
    proxies = {type_map[http_type]: urls}
    test_url = (type_map[http_type] +
                "://music.163.com/api/playlist/detail?id=432853362")
    ss_url = "https://www.google.com/?gws_rd=ssl"
    try:
        data = basic_req(test_url, 1, proxies)
        result = data["result"]
        tracks = result["tracks"]
        if len(tracks) == 10:
            if times < 0:
                self.judge_url(urls, index, times + 1)
            else:
                echo("1|debug", urls, proxies, "Proxies can use.")
                self.canuse_proxies.append(urls)
                self.can_use_ip[index] = [urls, int(http_type)]
                if ss_test:
                    data = basic_req(ss_url, 0)
                    if len(str(data)) > 5000:
                        self.can_use_ip[index] = [urls, int(http_type) + 2]
        else:
            echo("0|debug", urls, proxies, "Tracks len error ^--<^>--^ ")
            self.cannot_use_ip[index] = urls
    except:
        echo("0|debug", urls, proxies, "return error [][][][][][]")
        if index not in self.can_use_ip:
            self.cannot_use_ip[index] = urls
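# judge_url above relies on a module-level `type_map`; a minimal sketch of the
# assumed mapping from the `urls[4] == "s"` scheme check to a requests-style
# proxies key (this definition is inferred, not copied from the repo):
type_map = {False: "http", True: "https"}

# {type_map["https://1.2.3.4:8080"[4] == "s"]: "https://1.2.3.4:8080"}
# -> {"https": "https://1.2.3.4:8080"}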
def load_av_lists(self):
    url = self.MEMBER_SUBMIT_URL % self.assign_up_mid
    json_req = basic_req(url, 1)
    if json_req is None or 'data' not in json_req or 'vlist' not in json_req['data']:
        if can_retry(url):
            self.load_av_lists()
        return
    av_id_map = {ii['aid']: ii for ii in json_req['data']['vlist']}
    if self.basic_av_id not in av_id_map:
        if can_retry(url):
            self.load_av_lists()
        return
    self.av_id_map = av_id_map