def start(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Cookie': 'SINAGLOBAL=4813403181515.393.1614675647253; UOR=,,login.sina.com.cn; wvr=6; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhlR.k2GdqbjyZzEITOSHRr5JpX5KMhUgL.Foece0.0Sh2cehM2dJLoIp7LxKnL12BL1KzLxK.L1hML12H7i--fi-88i-2E; ALF=1649747641; SSOLoginState=1618211642; SCF=AgcHxrBHHt4UjbLh9mubH40GHYX5wHnmPtTAqB6TkyEecKlr459m1ZsEsdZPkpxPDP11WmTyMyb9vrmOnKIcOo8.; SUB=_2A25Nd4NqDeRhGeVI6FsS9C_KyzuIHXVuBPOirDV8PUNbmtANLXfWkW9NTAPerAbffhFF6sJAbxCl8XyTcJIKscGB; _s_tentry=login.sina.com.cn; Apache=9695619437120.088.1618211644332; ULV=1618211644356:10:3:1:9695619437120.088.1618211644332:1617702472974; wb_view_log_3639341607=1920*10801; webim_unReadCount=%7B%22time%22%3A1618211680769%2C%22dm_pub_total%22%3A2%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A11%2C%22msgbox%22%3A0%7D; WBStorage=8daec78e6a891122|undefined'
    }
    # Normalize http to https before requesting.
    if 'http:' in str(url):
        request_url = str(url).replace('http:', 'https:')
    else:
        request_url = url
    try:
        item = {}
        response = BaseFunctions.requests().get(request_url,
                                                verify=False,
                                                headers=headers,
                                                proxies=DefaultValues.proxies,
                                                timeout=DefaultValues.timeout)
        html = etree.HTML(response.text)
        # The read count sits in the right-floated block; keep only its digits.
        views_str = "".join(html.xpath("//div[@class='W_fr']//text()"))
        views = int("".join(re.findall(r'\d+', views_str)))
        item['views'] = views
        item['comments'] = None
        item['likes'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    source_url = 'https://comment.sina.com.cn/page/info?newsid={id}&channel=mp'
    try:
        item = {}
        # e.g. .../article_<num>_<suffix>.html -> newsid '<num>-<suffix>'
        news_id = "".join(re.findall(r'article_(.*)\.html', url)).replace('_', '-')
        response = BaseFunctions.requests().get(source_url.format(id=news_id),
                                                verify=False,
                                                timeout=DefaultValues.timeout,
                                                proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['comments'] = int(data['result']['count']['total'])
        item['likes'] = None
        item['views'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    source_url = ('https://www.dongchedi.com/motor/info/ugc/short/article/v1/'
                  '?group_id={article_id}&from=pc_station')
    try:
        item = {}
        article_id = "".join(re.findall(r'article/(.*)', url))
        response = BaseFunctions.requests().get(
            source_url.format(article_id=article_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['url'] = url
        item['comments'] = int(data['data']['comment_count'])
        item['views'] = int(data['data']['read_count'])
        item['likes'] = int(data['data']['digg_count'])
        item['forwards'] = int(data['data']['share_count'])
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    interface = 'https://c.mp.qq.com/cgi-bin/comment/Aggregation'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    try:
        item = {}
        article_id = "".join(re.findall(r'-(.*?)\.html', url))
        form_data = {'article_id': article_id, 'cmd[]': 'articleInfo'}
        response = BaseFunctions.requests().post(interface,
                                                 headers=headers,
                                                 data=form_data,
                                                 proxies=DefaultValues.proxies,
                                                 timeout=DefaultValues.timeout)
        data = json.loads(response.text)
        info = data['data']['articleInfo']['data']
        item['comments'] = int(info['comment_count'])
        item['likes'] = int(info['like_count'])
        item['views'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def bbVideo(url):
    source_url = 'https://api.bilibili.com/x/web-interface/view/detail?aid={aid}&bvid={bvid}'
    try:
        item = {}
        video_id = "".join(re.findall(r'/video/(.*)', url))
        # Old av-style IDs go in `aid` (digits only); new BV-style IDs go in `bvid`.
        if video_id.startswith('av'):
            request_url = source_url.format(aid=video_id.replace('av', ''), bvid='')
        else:
            request_url = source_url.format(aid='', bvid=video_id)
        response = BaseFunctions.requests().get(request_url,
                                                verify=False,
                                                proxies=DefaultValues.proxies)
        stat = json.loads(response.text)['data']['View']['stat']
        item['views'] = int(stat['view'])
        item['comments'] = int(stat['reply'])
        item['likes'] = int(stat['like'])
        item['forwards'] = int(stat['share'])
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

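# Usage sketch for bbVideo (illustrative URLs, not taken from the source):
# both ID styles resolve through the same detail endpoint.
#   bbVideo('https://www.bilibili.com/video/av170001')      # routed via aid=170001
#   bbVideo('https://www.bilibili.com/video/BV1xx411c7mD')  # routed via bvid=BV1xx411c7mD
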
def msouhu(url):
    source_url = ('https://api.interaction.sohu.com/api/comments/maincomments'
                  '?source_id=mp_{id}&reply_count=1&page_size=1&type=0&page_no=1')
    try:
        item = {}
        # The ID sits between the last path slash and the first underscore.
        sohu_id = "".join(re.findall(r'www\.sohu\.com/.*/(.*?)_', url))
        response = BaseFunctions.requests().get(source_url.format(id=sohu_id),
                                                verify=False,
                                                timeout=DefaultValues.timeout,
                                                proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['comments'] = int(data['data']['totalCount'])
        item['likes'] = None
        item['views'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def souhu3g(url):
    source_url = 'https://3g.k.sohu.com/api/comment/getCommentListByCursor.go?id={id}&busiCode=2'
    try:
        item = {}
        news_id = "".join(re.findall(r'/n(.*)', url))
        response = BaseFunctions.requests().get(source_url.format(id=news_id),
                                                timeout=DefaultValues.timeout,
                                                verify=False,
                                                proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['comments'] = int(data['response']['totalCount'])
        item['likes'] = None
        item['views'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    source_url = 'https://kuaibao.qq.com/getSubNewsContent?id={article_id}'
    item = {}
    try:
        # Strip any query string, then take everything after 's/' as the article ID.
        clean_url = str(url).replace("".join(re.findall(r'(\?.*)', url)), '')
        article_id = "".join(re.findall(r's/(.*)', clean_url))
        response = BaseFunctions.requests().get(
            source_url.format(article_id=article_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['url'] = url
        item['comments'] = data['count_info']['comments']
        item['likes'] = data['count_info']['like_info']
        item['forwards'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def judge_url(url):
    try:
        url = str(url)
        # 'ttarticle' must be tested before the generic 'weibo' match,
        # otherwise weibo.com/ttarticle/... pages never reach weibotoutiao.
        if 'ttarticle' in url:
            weibotoutiao.start(url)
        elif 'weibo' in url:
            weibo.start(url)
        elif '360kuai' in url or 'm.news.so.com' in url:
            kuaizixun.start(url)
        elif 'www.163.com' in url or 'dy.163.com' in url or '3g.163.com' in url:
            wangyihao.wangyi(url)
        elif 'cn.club.vmall.com' in url or 'club.huawei.com' in url:
            huafen.vmall(url)
        elif 'baidu.com' in url and 'tieba' not in url:
            baijiahao.start(url)
        elif 'toutiao.com' in url or 'm.toutiaocdn.net' in url:
            toutiao.toutiao(url)
        elif 'yidianzixun' in url:
            yidianzixun.yidianzixun(url)
        elif 'tieba.baidu.com' in url:
            tieba.start(url)
        elif 'a.mp.uc.cn' in url or 'm.uczzd.cn' in url or 'iflow.uc.cn' in url:
            dayuhao.start(url)
        elif 'kuaibao.qq.com' in url:
            kandiankuaibao.start(url)
        elif 'new.qq.com' in url:
            qiehao.start(url)
        elif 'www.zhihu.com' in url:
            zhihu.start(url)
        elif 'dongchedi' in url:
            dongchedi.start(url)
        elif 'm.sohu.com' in url:
            shoujisouhu.start(url)
        elif 'www.sohu.com' in url or '3g.k.sohu' in url:
            souhuhao.start(url)
        elif 'k.sina.com' in url:
            xinlangkandian.start(url)
        elif 'bilibili' in url:
            bilibili.start(url)
        elif 'haokan' in url:
            haokanshipin.start(url)
        elif 'post.mp.qq.com' in url:
            qqkandian.start(url)
        else:
            BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)
    except Exception:
        traceback.print_exc()

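# A minimal driver sketch (an assumed entry point, not part of the source;
# the file name is hypothetical): feed URLs to judge_url one per line.
def run_all(url_file='urls.txt'):
    with open(url_file, encoding='utf-8') as f:
        for line in f:
            url = line.strip()
            if url:
                judge_url(url)
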
def newspage_dt(url):
    try:
        item = {}
        response = BaseFunctions.requests().get(url,
                                                verify=False,
                                                proxies=DefaultValues.proxies)
        # Interaction counts are embedded in the page as `window.jsonData = {...};`.
        reg = r'window\.jsonData\s*=\s*({.*?});'
        data = json.loads("".join(re.findall(reg, response.text)))
        interaction = data['data']['pageInfo']['interaction_data']
        item['views'] = int(interaction['readNum']['count'])
        item['likes'] = int(interaction['praise']['praise_num'])
        item['forwards'] = int(interaction['forwardNum'])
        item['comments'] = int(interaction['commentNum'])
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def yidianzixun(url):
    source_url = 'http://www.yidianzixun.com/home/q/getcomments?docid={article_id}&count=30'
    try:
        item = {}
        # Drop the query string, then take the path segment after 'article/'.
        clean_url = str(url).replace("".join(re.findall(r'(\?.*)', url)), '')
        article_id = "".join(re.findall(r'article/(.*)', clean_url)).replace('/', '')
        response = BaseFunctions.requests().get(
            source_url.format(article_id=article_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        item['url'] = url
        item['comments'] = int(json.loads(response.text)['total'])
        item['forwards'] = None
        item['likes'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    try:
        item = {}
        # The page embeds its state as escaped JSON in `window.__PRELOADED_STATE__`.
        page = BaseFunctions.requests().get(
            url, verify=False,
            proxies=DefaultValues.proxies).content.decode('unicode-escape')
        reg = r'window\.__PRELOADED_STATE__\s*=\s*({.*})\s*;'
        data = json.loads("".join(re.findall(reg, page)))
        meta = data['curVideoMeta']
        item['views'] = int(meta['playcnt'])
        item['comments'] = int(meta['fmcomment_num'])
        item['likes'] = int(meta['fmlike_num'])
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    item = {}
    comment_source_url = 'https://coral.qq.com/article/{target_id}/commentnum'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    try:
        # Normalize the several new.qq.com URL shapes to a /rain/a/<id> page.
        if 'rain' in url:
            request_url = url
        elif 'cmsid' in url:
            request_url = 'https://new.qq.com/rain/a/' + "".join(
                re.findall(r'cmsid=(.*)', url))
        elif 'omn' in url:
            response_url = BaseFunctions.requests().get(
                url,
                verify=False,
                proxies=DefaultValues.proxies,
                timeout=DefaultValues.timeout).url
            if 'notfound.htm' in response_url:
                # Rebuild a rain URL from the last path segment, extension stripped.
                end_str = re.match(r'http(.*)/(.*)', url).group(2)
                article_id = end_str.replace(
                    "".join(re.findall(r'(\..*)', end_str)), '')
                request_url = 'https://new.qq.com/rain/a/' + article_id
            else:
                request_url = url
        else:
            request_url = url
        # Fetch the article page and pull out the comment target_id.
        response = BaseFunctions.requests().get(request_url,
                                                verify=False,
                                                proxies=DefaultValues.proxies,
                                                timeout=DefaultValues.timeout)
        target_id = "".join(re.findall(r'comment_id": "(.*)",', response.text))
        # Query the comment-count endpoint with the extracted target_id.
        comment_response = BaseFunctions.requests().get(
            comment_source_url.format(target_id=target_id),
            proxies=DefaultValues.proxies,
            verify=False,
            timeout=DefaultValues.timeout,
            headers=headers)
        comments = json.loads(comment_response.text)['data']['commentnum']
        item['url'] = url
        item['comments'] = comments
        item['forwards'] = None
        item['likes'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def mnews(url):
    source_interface = 'https://u.api.look.360.cn/comment/lists?url={param_url}'
    try:
        item = {}
        param_url = "".join(re.findall(r'url=(.*)', url))
        response = BaseFunctions.requests().get(
            source_interface.format(param_url=param_url),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['url'] = url
        item['comments'] = int(data['data']['total'])
        item['likes'] = None
        item['forwards'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def kuai360(url):
    source_url = 'https://www.360kuai.com/user/comment/lists?f=jsonp&page_key={page_key}'
    try:
        item = {}
        page_key = "".join(re.findall(r'\.com/(.*)', url))
        response = BaseFunctions.requests().get(source_url.format(page_key=page_key),
                                                verify=False,
                                                timeout=DefaultValues.timeout,
                                                proxies=DefaultValues.proxies)
        data = json.loads(response.text)
        item['url'] = url
        item['comments'] = int(data['data']['total'])
        item['likes'] = None
        item['forwards'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def knowpage(url):
    source_url = 'https://mbd.baidu.com/knowpage/api/getdynamicinfo?crid={crid}&qid={qid}'
    try:
        item = {}
        qid = "".join(re.findall(r'qid=(.*)', url))
        # Load the page to find the top reply's crid, then hit the dynamic-info API.
        response = BaseFunctions.requests().get(url,
                                                verify=False,
                                                proxies=DefaultValues.proxies)
        html = etree.HTML(response.text)
        crid = "".join(
            html.xpath("//li[@class='reply-item tpl-reply-item'][1]/@data-rid"))
        request_url = source_url.format(crid=crid, qid=qid)
        api_response = BaseFunctions.requests().get(request_url,
                                                    verify=False,
                                                    proxies=DefaultValues.proxies)
        data = json.loads(api_response.text)
        top_reply = data['data']['replies']['list'][0]
        item['comments'] = int(top_reply['commentCount'])
        item['likes'] = int(top_reply['thumbUp'])
        item['views'] = None
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def bbRead(url):
    source_url = 'https://api.bilibili.com/x/article/viewinfo?id={id}'
    try:
        item = {}
        article_id = "".join(re.findall(r'read/cv(.*)', url))
        response = BaseFunctions.requests().get(source_url.format(id=article_id),
                                                verify=False,
                                                proxies=DefaultValues.proxies)
        stats = json.loads(response.text)['data']['stats']
        item['views'] = int(stats['view'])
        item['likes'] = int(stats['like'])
        item['comments'] = int(stats['reply'])
        item['forwards'] = int(stats['share'])
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def baijiahao(url):
    try:
        item = {}
        source_url = ('https://mbd.baidu.com/webpage?type=homepage&action=interact'
                      '&format=jsonp&params=[{"feed_id":"%s","thread_id":"%s",'
                      '"dynamic_type":"2","dynamic_sub_type":"2001"}]&uk=%s')
        # get_id (defined below) scrapes feed_id, thread_id and uk off the article page.
        ids = get_id(url)
        request_url = source_url % (ids[0], ids[1], ids[2])
        # `headers` is assumed to be defined at module level alongside this function.
        response = BaseFunctions.requests().get(request_url,
                                                verify=False,
                                                proxies=DefaultValues.proxies,
                                                headers=headers)
        # Strip the JSONP wrapper before parsing.
        json_str = "".join(re.findall(r'callback\((.*)\)', response.text))
        data = json.loads(json_str)
        counts = data['data']['user_list']['_2001_']
        item['likes'] = int(counts['praise_num'])
        item['comments'] = int(counts['comment_num'])
        item['views'] = int(counts['read_num'])
        item['forwards'] = None
        item['url'] = url
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def vmall(url):
    try:
        item = {}
        response = BaseFunctions.requests().get(url,
                                                verify=False,
                                                proxies=DefaultValues.proxies,
                                                timeout=DefaultValues.timeout)
        html = etree.HTML(response.text)
        # On the forum thread page, '查看' = views and '回复' = replies.
        views = int(html.xpath("//span[@title='查看']/text()")[0])
        comments = int(html.xpath("//span[@title='回复']/text()")[0])
        item['url'] = url
        item['views'] = views
        item['comments'] = comments
        item['forwards'] = None
        item['likes'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def wangyi(url):
    source_url = ('https://comment.api.163.com/api/v1/products/'
                  'a2869674571f77b5a0867c3d71db5856/threads/{article_id}')
    try:
        item = {}
        article_id = "".join(re.findall(r'.*/(.*)\.html', url))
        response = BaseFunctions.requests().get(
            source_url.format(article_id=article_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        # Parse the JSON body once; all three counts come from the same payload.
        data = json.loads(response.text)
        item['url'] = url
        item['comments'] = int(data['tcount'])
        item['forwards'] = int(data['rcount'])
        item['likes'] = int(data['cmtCount'])
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    item = {}
    try:
        response = BaseFunctions.requests().get(url,
                                                verify=False,
                                                proxies=DefaultValues.proxies,
                                                timeout=DefaultValues.timeout)
        html = etree.HTML(response.text)
        # Reply count from the styled span in the thread header.
        comments = int("".join(
            html.xpath(
                "//*[@id='thread_theme_5']//span[@style='margin-right:3px']/text()"
            )))
        item['url'] = url
        item['comments'] = comments
        item['forwards'] = None
        item['likes'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def get_id(url):
    # Helper for baijiahao(): pulls feed_id, thread_id and uk out of the
    # page's embedded `window.jsonData` blob.
    try:
        response = BaseFunctions.requests().get(url,
                                                verify=False,
                                                timeout=DefaultValues.timeout,
                                                proxies=DefaultValues.proxies)
        reg = r'window\.jsonData\s*=\s*({.*?});'
        data = json.loads("".join(re.findall(reg, response.text)))
        feed_id = data['bsData']['superlanding'][0]['itemData']['notice']['id']
        thread_id = data['bsData']['comment']['tid']
        uk = data['bsData']['profitLog']['contentAccId']
        return (feed_id, thread_id, uk)
    except Exception:
        # Returns None on failure; the caller's except block handles it.
        return None

def toutiao(url):
    source_url = 'https://www.toutiao.com/article/v2/tab_comments/?group_id={article_id}'
    try:
        item = {}
        # Concatenates every digit run in the URL; this assumes the group_id
        # is the only numeric part of a toutiao article link.
        article_id = "".join(re.findall(r'\d+', url))
        response = BaseFunctions.requests().get(
            source_url.format(article_id=article_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        item['url'] = url
        item['comments'] = int(json.loads(response.text)['total_number'])
        item['forwards'] = None
        item['likes'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    source_get_itemId_url = 'https://ff.dayu.com/contents/{article_id}?biz_id=1002'
    source_comment_url = 'http://m.uczzd.cn/iflow/api/v2/cmt/article/{item_id}/comments/byhot'
    item = {}
    try:
        if 'm.uczzd.cn' in url or 'iflow.uc.cn' in url:
            # These hosts carry the item_id directly in the query string.
            item_id = "".join(re.findall(r'aid=(.*)', url))
        elif 'a.mp.uc.cn' in url:
            # Resolve the item_id through the dayu contents API first.
            article_id = "".join(re.findall(r'cid=(.*)', url))
            item_response = BaseFunctions.requests().get(
                source_get_itemId_url.format(article_id=article_id),
                verify=False,
                timeout=DefaultValues.timeout,
                proxies=DefaultValues.proxies)
            item_id = json.loads(
                item_response.text)['data']['_extra']['xss_item_id']
        else:
            BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)
            return
        comment_response = BaseFunctions.requests().get(
            source_comment_url.format(item_id=item_id),
            verify=False,
            timeout=DefaultValues.timeout,
            proxies=DefaultValues.proxies)
        data = json.loads(comment_response.text)
        item['url'] = url
        item['comments'] = data['data']['comment_cnt']
        item['likes'] = data['data']['like_cnt']
        item['forwards'] = None
        item['views'] = None
        BaseFunctions.writeFile(item, DefaultValues.item_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)

def start(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36',
        'Cookie': 'SINAGLOBAL=4813403181515.393.1614675647253; UOR=,,www.baidu.com; SUB=_2AkMXGS0Pf8NxqwJRmfsSz2PiZY9wwwHEieKhRdzUJRMxHRl-yT9kql4CtRB6PJkD4DyZKKRvisLn0T3XT1mmPjgYMP-T; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9W5MlfLdcaj5Hs24b9hrEZu4; login_sid_t=bb8b362aae60bcb8824d79fd69224aaf; cross_origin_proto=SSL; _s_tentry=-; Apache=4623102832203.645.1615254315211; ULV=1615254315215:3:3:2:4623102832203.645.1615254315211:1615176251737; WBtopGlobal_register_version=2021031011; wb_view_log=1440*9002%261920*10801; WBStorage=202103101503|undefined'
    }
    # Normalize http to https before requesting.
    if 'http:' in str(url):
        request_url = str(url).replace('http:', 'https:')
    else:
        request_url = url
    try:
        response = BaseFunctions.requests().get(request_url,
                                                verify=False,
                                                headers=headers,
                                                proxies=DefaultValues.proxies,
                                                timeout=DefaultValues.timeout)
        # The detail block ships as a script payload keyed by this domid.
        detail = re.search(
            r'{(.*?)"domid":"Pl_Official_WeiboDetail__73"(.*?)}\)',
            response.text)
        if detail:
            item = {}
            data = json.loads(detail.group().replace('})', '}'))
            html = etree.HTML(data['html'],
                              parser=etree.HTMLParser(encoding='utf-8'))
            # When a count is zero the button shows its label instead of a
            # number: '转发' (repost), '评论' (comment), '赞' (like).
            forward_text = html.xpath(
                "//span[@node-type='forward_btn_text']//text()")[1]
            forwards = 0 if forward_text == '转发' else int(forward_text)
            comment_text = html.xpath(
                "//span[@node-type='comment_btn_text']//text()")[1]
            comments = 0 if comment_text == '评论' else int(comment_text)
            like_text = html.xpath(
                "//div[@node-type='feed_list_options']"
                "//span[@node-type='like_status']//text()")[1]
            likes = 0 if like_text == '赞' else int(like_text)
            item['url'] = url
            item['forwards'] = forwards
            item['comments'] = comments
            item['likes'] = likes
            item['views'] = None
            BaseFunctions.writeFile(item, DefaultValues.item_path)
        else:
            BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)
    except Exception:
        BaseFunctions.writeFalseUrl(url, DefaultValues.false_path)
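
# The BaseFunctions and DefaultValues helpers used throughout are not shown in
# this section. Below is a minimal sketch of what they are assumed to provide;
# every default value and implementation detail here is an assumption for
# illustration, not the project's actual code.
import json

import requests


class DefaultValues:
    proxies = None            # e.g. {'https': 'http://127.0.0.1:8888'} (assumed)
    timeout = 10              # seconds (assumed)
    item_path = 'items.jsonl'       # hypothetical output path
    false_path = 'failed_urls.txt'  # hypothetical failure log


class BaseFunctions:
    @staticmethod
    def requests():
        # Anything exposing the `requests` get/post API works; a
        # requests.Session() would also fit the call sites above.
        return requests

    @staticmethod
    def writeFile(item, path):
        # Append one scraped record per line as JSON.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    @staticmethod
    def writeFalseUrl(url, path):
        # Record URLs that could not be parsed, for later retry.
        with open(path, 'a', encoding='utf-8') as f:
            f.write(url + '\n')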