def get_song_detail(self, id):
    """ get song detail from playlist """
    host = 'http://music.163.com/api/playlist/detail?id=' + str(id)
    json = proxy_req(host, 1)
    if json == 0:
        if can_retry(host):
            self.get_song_detail(id)
        return []
    result = json['result']
    tracks = result['tracks']
    if len(tracks) <= 1:
        if can_retry(host):
            self.get_song_detail(id)
        return []
    else:
        playcount = result['playCount']
        for track in tracks:
            songid = track['id']
            songname = track['name']
            self.songlist.append([songid, songname, playcount])
        self.finishlist.append(id)
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    if index:
        url = 'https://www.baidu.com/s?ie=utf-8&mod=1&isbd=1&isid=919fab3c0002c9f1&wd=%E5%81%B7%E7%8B%97&pn=730&oq=%E5%81%B7%E7%8B%97&tn=baiduhome_pg&ie=utf-8&rsv_idx=2&rsv_pq=919fab3c0002c9f1&rsv_t=7e30ggF%2BMa91oOURk1bMtN8af5unSwOR08TodNBB%2F%2B6B6RBEwUi8l8IAe28ACA%2B8b5I5&gpc=stf%3D1517038564%2C1548574564%7Cstftype%3D1&tfflag=1&bs=%E5%81%B7%E7%8B%97&rsv_sid=undefined&_ss=1&clist=28bc21fb856a58b7%09350102124f079888%0928bc21fb856a58b7%0928bc2159845c1cf3%0928bc2015823fa56b%0928a121fb84a7d1a6&hsug=&f4s=1&csor=2&_cr1=34767&pn=' + \
            str(index * 20)
    else:
        url = 'http://news.baidu.com/ns?rn=20&ie=utf-8&cl=2&ct=1&bs=%E6%AF%92%E7%8B%97%E8%82%89&rsv_bp=1&sr=0&f=8&prevct=no&tn=news&word=%E5%81%B7%E7%8B%97'
    news_lists = proxy_req(url, 0)
    if not news_lists:
        if can_retry(url):
            self.summarization_once(index)
        return
    summarization_lists = news_lists.find_all('div', class_='result')
    if not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text[:-8])
    self.summarizations[int(index)] = texts
def load_av_lists(self):
    url = self.MEMBER_SUBMIT_URL % self.assign_up_mid
    json_req = basic_req(url, 1)
    if json_req is None or 'data' not in json_req or 'vlist' not in json_req['data']:
        if can_retry(url):
            self.load_av_lists()
        return
    av_id_map = {ii['aid']: ii for ii in json_req['data']['vlist']}
    if self.basic_av_id not in av_id_map:
        if can_retry(url):
            self.load_av_lists()
        return
    self.av_id_map = av_id_map
def _getroom_id(self, next_to=True, proxy=True):
    ''' get av room id '''
    url = self.ROOM_INIT_URL % self._av_id
    html = proxy_req(url, 0) if proxy else basic_req(url, 0)
    head = html.find_all('head')
    # the 4th <script> in <head> is expected to carry the JSON payload with videoData
    if not len(head) or len(head[0].find_all('script')) < 4 \
            or '{' not in head[0].find_all('script')[3].text:
        if can_retry(url):
            self._getroom_id(proxy=proxy)
        else:
            self._getroom_id(proxy=False)
        next_to = False
    if next_to:
        script_list = head[0].find_all('script')[3].text
        script_begin = script_list.index('{')
        script_end = script_list.index(';')
        script_data = script_list[script_begin:script_end]
        json_data = json.loads(script_data)
        if self._p == -1 or len(json_data['videoData']['pages']) < self._p:
            self._room_id = json_data['videoData']['cid']
        else:
            self._room_id = json_data['videoData']['pages'][self._p - 1]['cid']
        print('Room_id:', self._room_id)
def get_goods_id_first(self, origin_url, index):
    """ get goods id first """
    origin_url = origin_url.replace('https', 'http')
    # first_result = proxy_req(origin_url, 0)
    first_result = basic_req(origin_url, 0, header=self.headers)
    if not first_result or len(first_result.find_all('script')) < 2:
        if can_retry(origin_url):
            self.get_goods_id_first(origin_url, index)
        return
    wait = first_result.find_all('script')[1].text
    if '"title":"' not in wait:
        return
    title = re.findall('"title":".*","', wait)[0].split('","')[0].split('":"')[1]
    if title in self.title2map:
        self.goods_map[index] = self.title2map[title]
        self.url2goods[origin_url] = self.title2map[title]
        print(self.title2map[title])
    else:
        print(title)
def search_goods_once(self, goods_name, index):
    if not os.path.exists('%scookie_alimama' % data_dir):
        print('alimama cookie not exist!!!')
        return
    with codecs.open('%scookie_alimama' % data_dir, 'r', encoding='utf-8') as f:
        cookie = f.readlines()
    url_list = [
        'https://pub.alimama.com/items/search.json?auctionTag=&perPageSize=50&shopTag=&_tb_token_=',
        cookie[1][:-1],
        '&pvid=', cookie[2][:-1],
        '&t=', str(int(round(time.time() * 1000))),
        '&_t=', str(int(round(time.time() * 1000))),
        '&q=', goods_name
    ]
    headers = {
        'X-Requested-With': 'XMLHttpRequest',
        'Cookie': '',
        'Content-Type': get_content_type(),
        'Accept': get_accept('xhr'),
    }
    headers['Cookie'] = cookie[0][:-1]
    ca = basic_req(''.join(url_list), 2, header=headers)
    if ca.status_code != 200 or 'data' not in ca.json():
        if can_retry(''.join(url_list)):
            self.search_goods_once(goods_name, index)
        return
    page_list = ca.json()['data']['pageList']
    title = ['||'.join([str(ii['auctionId']), goods_name, str(ii['zkPrice'])])
             for ii in page_list][0]
    self.goods_name[index] = title
    print(title)
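
# Note (inferred from search_goods_once above, not documented elsewhere): the
# data_dir/cookie_alimama file is expected to hold three newline-terminated
# lines -- line 1: the raw Cookie header, line 2: _tb_token_, line 3: pvid --
# which is why each value is read with a trailing-newline strip via [:-1].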
def load_index():
    ''' load index '''
    global movie_list
    version = begin_time()
    text = proxy_req(HOMEPAGE_URL, 3)
    if not len(text):
        if can_retry(HOMEPAGE_URL):
            load_index()
        return
    movie_list = re.findall('《(.*?)》', text)
    movie_more = re.findall('href="(.*?)">更多', text)
    for uri in movie_more:
        load_other(uri)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_another]
    shuffle_batch_run_thread(threading_list, 100)
    threading_list = [threading.Thread(
        target=load_other, args=(ii,)) for ii in movie_again]
    shuffle_batch_run_thread(threading_list, 100)
    # de-duplicate the movie list
    movie_list = set(movie_list)
    # dump the crawled movie list
    out_path = 'dytt8_result.txt'
    with open(out_path, 'w') as f:
        f.write('\n'.join(movie_list))
    url_num = len([*movie_more, *movie_another]) + 1
    movie_num = len(movie_list)
    echo(1, 'Requests num: {}\nMovie num: {}\nOutput path: {}\nSpend time: {:.2f}s\n'.format(
        url_num, movie_num, out_path, end_time(version, 0)))
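
# Hedged usage sketch (not part of the original source): load_index() is module
# level and mutates the global movie_list, so a minimal entry point, assuming
# this module is meant to be run directly, could look like this.
if __name__ == '__main__':
    load_index()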
def load_spot_once(self, pn=1, city_id=10186):
    ''' load spot once '''
    data = {
        'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
        'iMddid': city_id,
        'iTagId': 0,
        'iPage': pn,
    }
    data = self.load_sn(data)
    print(data)
    req = proxy_req(self.AJAX_ROUTER_URL, 11, data=data)
    if req is None or 'data' not in req or 'list' not in req['data']:
        if can_retry('{}{}{}'.format(self.AJAX_ROUTER_URL, city_id, pn)):
            self.load_spot_once(pn, city_id)
        return
    spot_list = req['data']['list']
    spot_pn = req['data']['page']
    spot_tmp = re.findall('<h3>.*?(.*?)</h3>', spot_list)
    try:
        total_pn = int(re.findall('共<span>(.*?)</span>', spot_pn)[0])
    except Exception as e:
        total_pn = 1
        echo(0, 'City id:', city_id, 'Page:', pn, spot_pn, e)
    if city_id not in self.spot_result:
        self.spot_result[city_id] = spot_tmp
    else:
        self.spot_result[city_id] += spot_tmp
    self.spot_pn[city_id] = total_pn
def get_movie_list_from_tabs(self, sorts: str, tags: str, genres: str, year_range: str, star: int = 0):
    ''' get info from movie list '''
    params_dict = {
        'sort': sorts,
        'range': '0,10',
        'tags': urllib.parse.quote(tags),
        'genres': urllib.parse.quote(genres),
        'star': star,
        'limit': 1000 if star < 9000 else 9999 - star,
        'year_range': year_range
    }
    params = ['{}={}'.format(ii, jj) for ii, jj in params_dict.items() if jj != '']
    url = '{}{}'.format(self.NEW_SEARCH_SUBJECT_URL, '&'.join(params))
    self.generate_cookie()
    movie_req = proxy_req(url, 2)
    if movie_req is None:
        if can_retry(url):
            self.get_movie_list_from_tabs(sorts, tags, genres, year_range, star)
        else:
            self.again_list.append([sorts, tags, genres, year_range, star])
            echo(0, url, 'Failed')
        return
    if movie_req.status_code != 200:
        return
    try:
        movie_json = movie_req.json()
        echo(2, url, 'loaded')
        id2name = {int(ii['id']): ii['title'] for ii in movie_json['data']}
        self.movie_id2name = {**self.movie_id2name, **id2name}
    except:
        echo(0, url, 'Except!')
def get_search_list(self, q: str):
    if self.proxy_can_use:
        base_url = self.API_PROXY_URL if random.random() * 10 > 7 else self.API_BASIC_URL
    else:
        base_url = self.API_BASIC_URL
    url = '{}search?q={}&count=66'.format(base_url, urllib.parse.quote(q))
    search_json = proxy_req(url, 1)
    if search_json is None or 'subjects' not in search_json:
        if search_json and 'code' in search_json:
            if search_json['code'] == 112:
                self.proxy_can_use = False
        if can_retry(url, 6):
            time.sleep(random.random() * (3.14 + random.randint(4, 10)) + 3.14)
            self.get_search_list(q)
        else:
            self.again_list.append(q)
            echo(0, url, 'Failed')
        return
    # echo(2, url, 'loaded')
    id2name = {int(ii['id']): ii['title'] for ii in search_json['subjects']}
    self.movie_id2name = {**self.movie_id2name, **id2name}
    self.finish_list.append(q)
    if not len(self.finish_list) % 600:
        echo(2, len(self.finish_list), 'Finish...')
        dump_bigger(self.movie_id2name, '{}douban_movie_id.pkl'.format(data_dir))
def load_url(self):
    ''' load url from zimuzu '''
    url = 'http://zmz005.com/{}'.format(self.zimuzu_id)
    detail = proxy_req(url, 0)
    total = []
    if not detail:
        print('retry')
        if can_retry(url):
            self.load_url()
        return
    season_list = detail.find_all('div', class_='tab-content info-content')[1:]
    for season in season_list:
        quality_list = season.find_all('div', class_='tab-pane')
        url_body = quality_list[1] if 'APP' in quality_list[0]['id'] else quality_list[0]
        season_id = re.findall(r"\d+\.?\d*", url_body['id'])[0]
        total.append(season_id)
        if int(season_id) < 12:
            url_body = quality_list[1]
        url_list = url_body.find_all('ul', class_='down-links')
        url = [index.find_all('div', class_='copy-link')[1]['data-url']
               for index in url_list]
        total.append('\n'.join(url) + '\n')
    with codecs.open('{}{}'.format(data_dir, self.drama_name), 'w', encoding='utf-8') as f:
        f.write('\n'.join(total))
def summarization_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    hrefs = []
    if index:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=PcVKXJKRIc7s8AXB05e4Dw&sa=N&ved=0ahUKEwjSo5nBvojgAhVONrwKHcHpBfcQ8tMDCFE&biw=1627&bih=427&dpr=2&start=' + \
            str(index * 10)
    else:
        url = 'https://www.google.com.hk/search?q=%E5%81%B7%E7%8B%97&newwindow=1&safe=strict&tbm=nws&ei=O8VKXJ7nFoP_8QX1oK_gDA&start=0&sa=N&ved=0ahUKEwje8JTAvojgAhWDf7wKHXXQC8w4ChDy0wMISQ&biw=1627&bih=427&dpr=2'
    news_lists = basic_req(url, 0)
    href_lists = news_lists.find_all('a', class_=['RTNUJf', 'l lLrAF'])
    summarization_lists = news_lists.find_all('div', class_='gG0TJc')
    if not len(href_lists) or not len(summarization_lists):
        if can_retry(url):
            self.summarization_once(index)
        return
    print('num: ', len(summarization_lists), url)
    for href in href_lists:
        hrefs.append(href['href'])
    for summarization in summarization_lists:
        temp_text = summarization.text.replace('\n', '').replace(
            '\xa0', '').replace('\t', '').replace('...', '').strip()
        temp_text = ' '.join(temp_text.split())
        texts.append(temp_text)
    self.summarizations[int(index)] = texts
    self.hrefs[int(index)] = hrefs
def get_web_content(self):
    req = proxy_req(self.WEB_URL, 3, header=self.get_ynote_web_header())
    if len(req) < 1000:
        if can_retry(self.WEB_URL):
            return self.get_web_content()
        else:
            return
    return req
def get_a_m_basic(self, a_m_url: str):
    headers = self.get_tb_headers(a_m_url)
    req = proxy_req(a_m_url, 2, header=headers, config={"allow_redirects": False})
    if req is None or "location" not in req.headers:
        if can_retry(a_m_url):
            return self.get_a_m_basic(a_m_url)
        return
    return req
def get_api_req(self, url: str, av_id: int):
    req = self.proxy_req(url, 1, header=self.get_api_headers(av_id))
    if req is None or list(req.keys()) != self.JSON_KEYS:
        if can_retry(url):
            return self.get_api_req(url, av_id)
        else:
            return
    return req["data"]
def get_s_click_detail(self, redirect_url: str, tu_url: str):
    headers = self.get_tb_headers(refer_url=tu_url)
    req = proxy_req(redirect_url, 2, header=headers)
    if req is None or "id=" not in req.url:
        if can_retry(redirect_url):
            return self.get_s_click_detail(redirect_url, tu_url)
        else:
            return
    return self.get_item_detail(req.url)
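
# Hedged usage sketch (not part of the original source): it chains
# get_a_m_basic() and get_s_click_detail() above, assuming the "location"
# header returned for the short link is the s.click redirect URL and that
# `tb` is an instance of the class these methods belong to.
def resolve_a_m_item(tb, a_m_url: str, tu_url: str):
    ''' follow one short-link hop, then load the item detail page '''
    first_hop = tb.get_a_m_basic(a_m_url)
    if first_hop is None:
        return
    return tb.get_s_click_detail(first_hop.headers["location"], tu_url)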
def get_request(self, url, types):
    result = basic_req(url, 1)
    if not result:
        if can_retry(url):
            return self.get_request(url, types)
        return
    return result
def get_request_v2(self, url, types, header):
    result = proxy_req(url, 0, header=header)
    if not result or not len(result.find_all('div', class_='content')):
        if can_retry(url):
            return self.get_request_v2(url, types, header)
        return
    return result
def get_request_v3(self, url, types):
    result = basic_req(url, 0)
    if not result or not len(result.find_all('p', class_='content')):
        if can_retry(url):
            return self.get_request_v3(url, types)
        return
    return result
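
# Hedged refactoring sketch (not part of the original source): get_request,
# get_request_v2 and get_request_v3 above all share the same
# fetch -> validate -> retry-through-can_retry() shape, which could be factored
# into one helper. `fetcher` and `is_valid` are hypothetical names.
def fetch_with_retry(url, fetcher, is_valid):
    ''' fetch url and retry for as long as can_retry() still allows it '''
    result = fetcher(url)
    if not is_valid(result):
        if can_retry(url):
            return fetch_with_retry(url, fetcher, is_valid)
        return None
    return result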
def share_article(self, article_id: str):
    p = self.share2article[article_id][-2].split("/")[-1]
    url = self.MYSHARE_URL % (p, self.cstk)
    req = proxy_req(url, 1, header=self.get_ynote_web_header(1))
    if req is None or list(req.keys()) != ["entry", "meta"]:
        if can_retry(url):
            return self.share_article(article_id)
        return False
    echo("2", "Share article {} Success!!!".format(article_id))
    return True
def get_captcha(self, cookie: dict = {}):
    url = self.CAPTCHA_URL
    headers = self.get_login_headers(0, cookie)
    captcha, cookies = proxy_req(url, 1, header=headers, need_cookie=True)
    if captcha is None or list(captcha.keys()) != ['code', 'data']:
        if can_retry(url):
            return self.get_captcha()
        else:
            return None, {}
    return captcha['data']['result'], cookies
def get_item_basic(self, item_id: int, url: str = ""):
    url = self.ITEM_URL % item_id if url == "" else url
    headers = {"Accept": get_accept("html")}
    req = proxy_req(url, 2, header=headers, config={"allow_redirects": False})
    if req is None:
        if can_retry(url):
            return self.get_item_basic(item_id, url)
        return
    if req.status_code != 200:
        return self.get_item_basic(item_id, req.headers["Location"])
    return req
def get_request(self, url: str, types: int, functs, header: dict = {}):
    if len(header):
        req = basic_req(url, types, header=header)
    else:
        req = basic_req(url, types)
    if functs(req):
        if can_retry(url):
            return self.get_request(url, types, functs, header)
        return
    return req
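
# Hedged usage sketch (illustrative only): `functs` is treated as a predicate
# that returns True when the response is NOT usable and should be retried.
# `spider`, the URL passed in and the CSS class below are hypothetical.
def fetch_content_page(spider, url: str):
    ''' call get_request() with a "page looks empty" predicate '''
    return spider.get_request(
        url, 0,
        lambda req: req is None or not len(req.find_all('div', class_='content')))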
def href_once(self, index):
    """ get html from news """
    print(index)
    texts = []
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=毒狗肉&pn=' + \
        str(index * 10)
    news_lists = proxy_req(url, 0)
    if not news_lists:
        if can_retry(url):
            self.href_once(index)
        return
    test = news_lists.find_all('div', class_='result')
    if not len(test):
        if can_retry(url):
            self.href_once(index)
        return
    href_list = [ii.a['href'] for ii in test]
    self.href_map[int(index)] = href_list
def get_cid(self, av_id: int):
    playlist_url = self.PLAYLIST_URL % av_id
    headers = {'Accept': '*/*', 'Referer': self.ROOM_INIT_URL % av_id}
    req = proxy_req(playlist_url, 1, header=headers)
    if req is None or list(req.keys()) != self.JSON_KEYS:
        if can_retry(playlist_url):
            return self.get_cid(av_id)
        else:
            return
    cid = [ii['cid'] for ii in req['data']]
    return cid
def check_type_req(self, av_id: int):
    changeHeaders({'Referer': self.BASIC_AV_URL % av_id})
    url = self.VIEW_URL % av_id
    json_req = proxy_req(url, 1)
    if json_req is None or 'data' not in json_req or 'tid' not in json_req['data']:
        if can_retry(url):
            self.check_type_req(av_id)
        return
    self.rank_type[av_id] = json_req['data']['tid'] == self.assign_tid
def load_comment_v1(self, movie_id: int, start: int):
    ''' load comment '''
    url = self.COMMENT_URL % (movie_id, start)
    self.generate_cookie()
    comment_json = proxy_req(url, 1)
    if comment_json is None or 'html' not in comment_json:
        if can_retry(url):
            time.sleep(random.random() * random.randint(0, 4))
            self.load_comment_v1(movie_id, start)
        else:
            self.again_list.append([movie_id, start])
            echo(0, url, 'Failed')
        return
    comment_html = comment_json['html']
    # comment_bs4 = BeautifulSoup(comment_html, 'html.parser')
    # comment = {}
    # for ii in comment_bs4.find_all('div', class_='comment-item'):
    #     user_id = ii.a['href'].split('/')[-2]
    #     user_name = ii.a['title']
    #     votes = ii.find_all('span', class_='votes')
    #     votes = votes[0].text if len(votes) else ''
    #     comment_time = ii.find_all('span', class_='comment-time')[0]['title']
    #     rate = ii.find_all('span', class_='rating')
    #     rate = rate[0]['class'][0].split('allstar')[1] if len(rate) else ''
    #     short = ii.find_all('span', class_='short')
    #     short = short[0] if len(short) else ''
    #     comment[user_id] = [user_name, user_id, comment_time, short, votes, rate]
    # user_list = set(comment)
    user_list = re.findall(
        r'title="(.*?)" href="https://www.douban.com/people/([\s\S]{1,30}?)/"\>',
        comment_html)
    if not len(user_list):
        self.finish_list[(movie_id, start)] = 0
        self.checkpoint()
        return
    votes = re.findall(r'votes"\>(\w{1,7}?)<', comment_html)
    comment_time = re.findall(r'-time " title="(.*?)">\n', comment_html)
    short = re.findall(r'class="short">([\s\S]*?)</span>', comment_html)
    rate = re.findall(r'allstar(\w{1,2}?) rat', comment_html)
    if len(user_list) != len(comment_time) or len(user_list) != len(short):
        echo(0, url, 'Comment reg error!!!')
    comment = {jj[1]: [jj[0], jj[1], comment_time[ii],
                       short[ii] if ii < len(short) else '',
                       votes[ii] if ii < len(votes) else '',
                       rate[ii] if ii < len(rate) else '']
               for ii, jj in enumerate(user_list)}
    user_list = {ii[1] for ii in user_list}
    self.user_info = {*self.user_info, *user_list}
    self.comment[movie_id] = {**self.comment[movie_id], **comment}
    if len(user_list) == 20 and (not (start + 20) % 100 or start < 100):
        self.more_user.append([movie_id, start + 20])
    self.finish_list[(movie_id, start)] = 0
    self.checkpoint()
def get_api_req(self, url: str, bv_id: str, types: int = 0):
    if types == 0:
        req = self.proxy_req(url, 1, header=self.get_api_headers(bv_id))
    else:
        req = self.proxy_req(url, 3, header=self.get_api_headers(bv_id))
        req = self.decoder_jp(req)
    if req is None or list(req.keys()) != self.JSON_KEYS:
        if can_retry(url):
            return self.get_api_req(url, bv_id, types)
        else:
            return
    return req["data"]
def request_text(self, url):
    ''' requests text '''
    req = basic_req(url, 2)
    if req is None:
        echo(0, url)
        if can_retry(url):
            return self.request_text(url)
        return ''
    echo(1, url)
    return req.text
def get_danmaku_once(self, oid: int):
    dm_url = self.DM_URL % oid
    req = proxy_req(dm_url, 2)
    if req is None:
        if can_retry(dm_url):
            return self.get_danmaku_once(oid)
        else:
            return
    req.encoding = "utf-8"
    dm = regex.findall('p="(.*?)">(.*?)</d>', req.text)
    echo(3, "oid {} have {} dm".format(oid, len(dm)))
    return dm, oid
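
# Hedged usage sketch (not part of the original source): it chains get_cid()
# and get_danmaku_once() above, assuming both methods live on the same crawler
# class and `crawler` is an instance of it.
def fetch_all_danmaku(crawler, av_id: int) -> dict:
    ''' collect danmaku for every cid of one av, keyed by oid '''
    result = {}
    for cid in crawler.get_cid(av_id) or []:
        dm_oid = crawler.get_danmaku_once(cid)
        if dm_oid is not None:
            dm, oid = dm_oid
            result[oid] = dm
    return result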