def get_HtmlText(keyword):
    '''
    Fetch the search result pages for a keyword.
    :param keyword: product keyword
    :return: yields the HTML of each result page
    '''
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'referer': 'https://www.jd.com/2019',
    }
    url = 'https://search.jd.com/search?'
    for i in range(1, 10):
        proxies = get_proxies()  # rotate to a fresh proxy for every page
        param = {
            'keyword': keyword,
            'enc': 'utf-8',
            'page': i,
        }
        try:
            r = requests.get(url, params=param, headers=headers, proxies=proxies)
            r.raise_for_status()
            r.encoding = 'utf-8'
            yield r.text
        except Exception as e:
            print(e)
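# Usage sketch (an added example, not part of the original code):
# get_HtmlText() is a generator, so the nine result pages are fetched
# lazily, one per loop iteration, each through a freshly rotated proxy.
if __name__ == '__main__':
    for html in get_HtmlText('python'):
        print(len(html))  # a real caller would parse the page here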
def getHtmlText(url, times=5):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
        'Referer': 'www.baidu.com',
        'Connection': 'close',
    }
    cookies = {
        'cookie': 'id=220793df29bd00a4||t=1554689493|et=730|cs=002213fd48ec54ab198b47c57e',
    }
    proxies = get_proxies()
    try:
        s = requests.Session()
        s.keep_alive = False  # together with the 'Connection: close' header, drop the connection after each request
        r = s.get(url, headers=headers, proxies=proxies, cookies=cookies)
        r.raise_for_status()
        r.encoding = 'gb2312'
        return r.text
    except Exception as e:
        if times > 0:
            print('Retrying...')
            time.sleep(3)
            return getHtmlText(url, times=times - 1)
        else:
            print(e)
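# Alternative retry strategy (a sketch, not part of the original code): the
# manual sleep-and-recurse loop above can be delegated to urllib3's Retry
# via an HTTPAdapter, which applies the backoff and retry budget itself.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_retrying_session(retries=5, backoff=3):
    s = requests.Session()
    retry = Retry(total=retries, backoff_factor=backoff,
                  status_forcelist=[500, 502, 503, 504])
    s.mount('http://', HTTPAdapter(max_retries=retry))
    s.mount('https://', HTTPAdapter(max_retries=retry))
    return s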
def getHtmlText(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400',
        'Referer': 'https://search.bilibili.com/',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(e)
def getHtmlText(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'referer': 'https://music.163.com/',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=5)
        r.raise_for_status()
        return r.json()  # this endpoint returns JSON, not HTML
    except Exception as e:
        print(e)
def getHtmlText(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
        'Referer': 'https://maoyan.com/board',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=3)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(e)
def getHtmlText(url):
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess the charset from the body
        return r.text  # r.encoding only affects .text; .content would be raw bytes
    except Exception as e:
        print(e)
def getHtmlText(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3505.400',
    }
    cookies = {
        'cookie': '_ga=GA1.2.1409714415.1554703585; __gads=ID=a5304af261dcd4c2:T=1554703585:S=ALNI_MbEsW4VT55Z3ElVFspuBPVYP8Thwg; UM_distinctid=16a1ec92e0bca-04c3b65518f387-4d764416-100200-16a1ec92e0c138; __utma=226521935.1409714415.1554703585.1556871669.1556871669.1; __utmz=226521935.1556871669.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _gid=GA1.2.839933537.1557463307; .Cnblogs.AspNetCore.Cookies=CfDJ8JcopKY7yQlPr3eegllP76ONU4DW7YjCWy40gI6hf9myPV5ajk7NVExemnfstB1tbKJF2OyOqLT2GisTVjvl4bcF-mn-4aXJ52G2n2gLQBobFwxenWoyAOvcGa13hlH32dXrq_PuOyuhgHXcH2nby0WfC7cPTAQTYOdCy7YW805ZP5e3ZT3ALtIt_0_2Vft90hmllh3jNhNPMlCJPejJlz2EXsNhEyWS1KjFXxJ84fNM_zoF6xhU7Kj4sCn9vHhfg3B7SznY0qaTT5f0uCEeIhrD1gZn4YPIP-QBLG2JVoA6O1gkhLwcl-Vw-WweKjuAS0VbMVgo4J_bF9qAVqWuphAWls-CdFR-dqwcIHA-Sj3PFpiKh1givGqsthFY4Pf6lZyRhCNU_Aopac48afzKxU9Oi6S-zeVx0iy07UmG7CF5vXFH4DT_Dtkmsam_Yd4nkQ; .CNBlogsCookie=70A254D254EC0CB9FD627D2B7FE7A278AF297DD82AD19D643B5D263438C4195FAF708B086BE305CC124EDD70B8A126ABF076B337AE4F8CCEB35326582BAE6F7A0565B1554A33A5982F7E83150ED7CB57F9173534; _gat=1'
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, cookies=cookies)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
def getHtmlText(url):
    """
    Fetch the page source.
    :param url: page URL
    :return: page content
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'Referer': 'https://www.dota2.com.cn/news/index.htm',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=5)
        r.raise_for_status()
        # print('encoding:', r.encoding)
        return r.text
    except Exception as e:
        print(e)
def getHtmlText(self, url):
    """
    Fetch the response for a URL.
    :param url:
    :return: response text
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3704.400 QQBrowser/10.4.3587.400',
        'Origin': 'https://lianjia.com',
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        return r.text
    except Exception as e:
        print(e)
def get_comments(productId):
    '''
    Fetch 10 pages of comments for one product.
    :param productId:
    :return:
    '''
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3719.400 QQBrowser/10.5.3715.400',
        'referer': 'https://www.jd.com/2019',
    }
    url = 'https://sclub.jd.com/comment/productPageComments.action?'
    for page in range(0, 10):
        proxies = get_proxies()
        param_data = {
            'callback': 'fetchJSON_comment98vv1216',
            'productId': productId,
            'score': 0,
            'sortType': 5,
            'page': page,
            'pageSize': 10,
            'isShadowSku': 0,
            'fold': 1,
        }
        try:
            response = requests.get(url, params=param_data, headers=headers, proxies=proxies)
            response.raise_for_status()
            # strip the JSONP wrapper fetchJSON_comment98vv1216(...) so the body parses as plain JSON
            new_resp = response.text.replace('fetchJSON_comment98vv1216(', '').replace(');', '')
            data = json.loads(new_resp)
            comments = data['comments']
            p = Pool()
            p.map(save_into_mongo, comments)
            p.close()  # release the worker processes created for this page
            p.join()
        except Exception as e:
            print(e)
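# Robustness sketch (an added helper, not part of the original code): the
# JSONP callback name varies between JD product pages (here it happens to be
# fetchJSON_comment98vv1216), so unwrapping the payload with a regex avoids
# hard-coding it.
import json
import re


def unwrap_jsonp(text):
    # match callbackName( ... ); and keep only the JSON payload inside
    m = re.match(r'^\s*[\w$]+\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(m.group(1) if m else text)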
def getHtmlText(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
        'Referer': 'https://www.biqudao.com/bqge127585/7403388.html',
    }
    cookies = {
        'cookie': "UM_distinctid=16ab10a567b541-08e6327450281f-4a764a16-100200-16ab10a567c4a8; CNZZDATA1275036968=1243908052-1557745098-https%253A%252F%252Fwww.baidu.com%252F%7C1557745098; bookid=127585; bcolor=; font=; size=; fontcolor=; width=; chapterid=7403387; chaptername=%25u7B2C%25u4E00%25u5343%25u4E00%25u767E%25u4E00%25u5341%25u4E8C%25u7AE0%25u771F%25u6B63%25u7684%25u7389%25u5E1D%25uFF08%25u4E0A%25uFF09"
    }
    proxies = get_proxies()
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)
def getHtmlText(url, ipcount=3):
    proxies = get_proxies()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3650.400 QQBrowser/10.4.3341.400',
        'referer': 'https://weixin.sogou.com/',
    }
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
        return r.text
    except Exception as e:
        if '443' in str(e):
            # proxy refused on port 443: swap in a new proxy and retry
            if ipcount > 0:
                return getHtmlText(url, ipcount=ipcount - 1)
            else:
                print('finish')
        else:
            print(e)
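# Iterative equivalent (a sketch using the same requests/get_proxies setup):
# a plain loop rotates to a fresh proxy on each failed attempt without
# growing the call stack the way the recursive version does.
def getHtmlTextIter(url, attempts=3):
    for _ in range(attempts):
        try:
            r = requests.get(url,
                             headers={'referer': 'https://weixin.sogou.com/'},
                             proxies=get_proxies(), timeout=10)
            r.raise_for_status()
            return r.text
        except Exception as e:
            print(e)
    print('finish')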
import getIpfromWeb
import getProxy
import time


def main():
    ip_list = getIpfromWeb.getIp()
    # verified_list = getIpfromWeb.verifyIP(ip_list)
    verified_list = getIpfromWeb.verifyip_multithread(ip_list)
    getIpfromWeb.data_persistence(verified_list)
    print('crawl verified ip list is : ', verified_list)


if __name__ == '__main__':
    start = time.time()
    main()
    getIpfromWeb.refresh_db()
    print(getProxy.get_proxies())
    end = time.time()
    print('elapsed:', end - start)
    # 49.479830265045166 s single-threaded
    # 28.486629486083984 s multi-threaded
import requests
import openpyxl
import getProxy
import cssselect
from bs4 import BeautifulSoup

url = 'https://sx.fang.anjuke.com/loupan/yuechengqu/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3650.400 QQBrowser/10.4.3341.400',
    'Referer': 'https://sx.fang.anjuke.com/loupan/yuechengqu/',
}
cookies = {
    'cookie': 'ctid=66; aQQ_ajkguid=DE7D9768-0C31-4978-1D21-SX0428133507; sessid=1ACFCBE2-691C-D1FA-3461-SX0428133507; isp=true; lps=http%3A%2F%2Fuser.anjuke.com%2Fajax%2FcheckMenu%2F%3Fr%3D0.8617607606270921%26callback%3DjQuery1113003545128319064261_1556429737063%26_%3D1556429737064%7Chttps%3A%2F%2Fsx.fang.anjuke.com%2Floupan%2F%3Fpi%3Dbaidu-cpcaf-sx-tyongsx1%26kwid%3D94538308235; twe=2; Hm_lvt_c5899c8768ebee272710c9c5f365a6d8=1556429737; 58tj_uuid=bd822b30-9606-4160-b237-b5d4e97b0bee; init_refer=https%253A%252F%252Fwww.baidu.com%252Fbaidu.php%253Fsc.Ks000001qLT2daZnZ-PnF6VfdHyzXE-ljY0UrvaFUKIRtpg9KJcd9o4fAIs3sHFn5I15dnr1cxvmqRXRvA2Fo_GSHwoy0YExi11QSADpD8TcHS8v_A8rvHYqMoR84_ePK0LVZrcC5YafivARH99CJDgO8zUPHBM3mU7GkjXDVYUvWFedqTpq8v7GfSFBLV_Z5Bv_RPU8MjGjGACins.DR_NR2Ar5Od66Wl5Qy1hthm_8jViBaBeKWuB6e3L_g_3_AXZZa1o1kqAX9CIhWl2SEUsmhPOu83erPh1FkLQDkELIrOklXrH-JuIz3qis1f_XPMZBC0.U1Yk0ZDqzI1fzeXOEP_0mywkXHLFLPjQVeStvsK9uZ7Y5Hc0TA-W5HD0IjLFLPjQzVaLEe1U0A-V5HczPfKM5yq-TZnk0ZNG5yF9pywdUAY0TA-b5Hnz0APGujYzP1T0UgfqnH0kPdtknjD4g1DsnHPxn10knNt1PW0k0AVG5H00TMfqP16L0ANGujY0mhbqnW0Y0AdW5H6srHRsPHRkPNtknj0kg1c4rHR1nHczn1IxnH01g100TgKGujY1n6Kkmv-b5HcLPsKzuLw9u1Ys0A7B5HKxn0K-ThTqn6KsTjYs0A4vTjYsQW0snj0snj0s0AdYTjYs0AwbUL0qn0KzpWYs0Aw-IWdsmsKhIjYs0ZKC5H00ULnqn0KBI1Yz0A4Y5H00TLCq0A71gv-bm1dsTzdCu0KYIgnqnHR4nHTzrHR3Pj6LrHbsn104Pj60ThNkIjYkPHRvPjc4PWb4nHcd0ZPGujY4ujFbPWFWuj0snjRLnjb30AP1UHdDwHwKfbuawbuDfbPKPDnY0A7W5HD0TA3qn0KkUgfqn0KkUgnqn0KlIjYs0AdWgvuzUvYqn7tsg1Kxn7ts0Aw9UMNBuNqsUA78pyw15HKxn7tsg1nkn1R3rjuxn0Ksmgwxuhk9u1Ys0AwWpyfqn0K-IA-b5iYk0A71TAPW5H00IgKGUhPW5H00Tydh5HDv0AuWIgfqn0KhXh6qn0Khmgfqn0KlTAkdT1Ys0A7buhk9u1Yk0Akhm1Ys0AqY5H00ULFsIjYsc10Wc10Wn0KWThnqn1nLrjc%2526word%253D%2525E7%2525BB%25258D%2525E5%252585%2525B4%2525E6%252588%2525BF%2525E4%2525BB%2525B7%2526ck%253D3354.1.58.325.187.335.173.595%2526shh%253Dwww.baidu.com%2526sht%253D95697632_hao_pg%2526us%253D1.0.1.0.0.0.0%2526bc%253D110101; new_uv=1; wmda_uuid=796de203d230c5efa426350a8e8970bd; wmda_new_uuid=1; wmda_session_id_8788302075828=1556429737519-93e198a7-972b-71b9; wmda_visited_projects=%3B8788302075828; als=0; new_session=0; __xsptplusUT_8=1; __xsptplus8=8.1.1556429738.1556430628.14%232%7Cwww.baidu.com%7C%7C%7C%25E7%25BB%258D%25E5%2585%25B4%25E6%2588%25BF%25E4%25BB%25B7%7C%23%23Czq2qvFzGKNhXO64wbNMLCm0VCYmDeIi%23; Hm_lpvt_c5899c8768ebee272710c9c5f365a6d8=1556430628'
}
proxies = getProxy.get_proxies()


def getHtmlText(url):
    try:
        r = requests.get(url, headers=headers, proxies=proxies, cookies=cookies)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)


def getInfo(html, fname):
    wb = openpyxl.load_workbook(fname)
    soup = BeautifulSoup(html, 'html.parser')
import requests
import time
import random
import openpyxl
import sys

# make the shared proxy-pool helper importable from its project directory
sys.path.append(
    r'C:\Users\Administrator\Desktop\python\Py_webSpider\github\IP_pond\Improve'
)
from getProxy import get_proxies

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3676.400 QQBrowser/10.4.3469.400',
    'Referer': "https://movie.douban.com/tv/"
}
proxies = get_proxies()
url = 'https://movie.douban.com/j/search_subjects?type=tv&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=20'


def getHTML(url):
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except Exception as e:
        print(e)


def getTVinfo(html, fname):
    try: