def _build_chrome_options(self, headless=True, random_user=False):
    chrome_options = Options()
    chrome_options.add_argument("--disable-notifications")
    chrome_options.add_argument("--verbose")
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_experimental_option(
        "prefs",
        {
            "download.default_directory": self.download_dir,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing_for_trusted_sources_enabled": False,
            "safebrowsing.enabled": False,
        },
    )
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--disable-software-rasterizer")
    if headless:
        chrome_options.add_argument("--headless")
    if random_user:
        ua = UserAgent(family="chrome")
        randomua = ua.random()
        chrome_options.add_argument(f"user-agent={randomua}")
    return chrome_options
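# Hedged usage sketch, not from the original project: _build_chrome_options above is
# written as a method (it reads self.download_dir), so a minimal stand-in object supplies
# that attribute here. Assumes selenium plus a local chromedriver; the download path and
# target URL are made-up examples.
from types import SimpleNamespace
from selenium import webdriver

owner = SimpleNamespace(download_dir="/tmp/downloads")
options = _build_chrome_options(owner, headless=True, random_user=True)
driver = webdriver.Chrome(options=options)
driver.get("https://example.com")
driver.quit()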
def gen_news():
    ua = UserAgent()
    user_agent = ua.random()
    referer = 'https://tushare.pro/login?next=%2Fnews%2Fnews_sina'
    headers = {
        'User-Agent': user_agent,
        'Host': 'tushare.pro',
        'Origin': 'https://tushare.pro',
        'Referer': referer
    }
    stockPageRequest = request.urlopen('http://finance.eastmoney.com/news/cdfsd.html')
    htmlTitleContent = str(stockPageRequest.read(), 'utf-8')
    # regex-match the headline titles
    titlePattern = re.compile('<span class="l3 a3">title="(.*?)"</span>', re.S)
    p_title = 'title="(.*?)"(.*?)'
    title = re.findall(p_title, htmlTitleContent)
    # keep only titles that start with '【' (str.find returns 0 there)
    title = [t[0] for t in title if not t[0].find('【')]
    news = []
    for t in title:
        a = t.find('【')
        b = t.find('】')
        news.append({'title': t[a + 1:b], 'content': t[b + 1:]})
    # news = News.objects.all()
    return news
def test():
    ua = UserAgent(family='chrome', os_family='linux')
    for i in range(100):
        res = ua.random()
        print(res)
def get_page(url, options={}):
    try:
        ua = UserAgent()
    except Exception:
        ua = None
    try:
        base_headers = {
            'User-Agent': ua.random(),
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
    except Exception:
        # fall back to a fixed User-Agent when my_fake_useragent is unavailable
        base_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
    headers = dict(base_headers, **options)
    print('Getting', url)
    try:
        r = requests.get(url, headers=headers)
        print('Getting result', url, r.status_code)
        if r.status_code == 200:
            return r.text
    except ConnectionError:
        print('Crawling Failed', url)
        return None
def set_options():
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-setuid-sandbox")
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920x1080")
    ua = UserAgent(family='chrome')
    randomua = ua.random()
    chrome_options.add_argument(f'user-agent={randomua}')
    print(randomua)
    return chrome_options
def __init__(self):
    self.ua = UserAgent()
    self.headers = {'User-Agent': self.ua.random()}
    # proxy IP API
    self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
    # Redis database
    self.redi = redis.Redis(host='127.0.0.1', port=6379, db=0,
                            decode_responses=True, password='******')
    # counter of failed API requests
    self.count = 0
def youtube_scrapper(self, query, number_results=2):
    "Function to scrape results from Youtube Search"
    query = urllib.parse.quote_plus(query)  # Format into URL encoding
    ua = UserAgent(family='chrome')
    assert isinstance(query, str)           # Search term must be a string
    assert isinstance(number_results, int)  # Number of results must be an integer
    escaped_search_term = query.replace(' ', '+')
    google_url = "https://www.google.com/search?q={}&num={}".format(
        query + "+site:youtube.com", 1)
    # print(google_url)
    response = requests.get(google_url, headers={"User-Agent": ua.random()})
    soup = BeautifulSoup(response.text, "html.parser")
    result_div = soup.find_all('div', attrs={'class': 'ZINbbc'})
    self.Links = []
    self.Titles = []
    for r in result_div:
        # Checks if each element is present, else, raise exception
        try:
            link = r.find('a', href=True)
            title = r.find('div', attrs={'class': 'vvjwJb'}).get_text()
            # Check to make sure everything is present before appending
            if link != '' and title != '':
                self.Links.append(link['href'])
                self.Titles.append(title)
            if len(self.Links) == number_results:
                break
        # Next loop if one element is not present
        except:
            continue
    for i in range(0, len(self.Links)):
        self.Links[i] = self.Links[i].replace("/url?q=", "")
    for i in range(0, len(self.Links)):
        if self.Links[i].find("watch") != -1:
            self.Links[i] = self.Links[i].replace("%3F", "?")
            self.Links[i] = self.Links[i].replace("%3D", "=")
            self.Links[i] = self.Links[i].split("&")[0]
        else:
            continue
    if len(self.Links) == 0:
        return
    else:
        for i in range(0, len(self.Links)):
            d = dict()
            d["title"] = self.Titles[i]
            d["linktopage"] = self.Links[i]
            self.youtube_result.append(d)
def spider(url):
    # random User-Agent to get past basic anti-crawler checks
    response = requests.get(url, headers={'User-Agent': UserAgent().random()})
    res = response.content
    html = str(res, 'utf-8')  # decode the HTML page
    html_tree = bs(html, 'lxml')
    # grab the content under class="wz_content" tags
    html_text = html_tree.find_all("div", class_="wz_content")
    All_text = []
    for text in html_text:
        one_text = []
        text_url = text.find('a')['href']              # link of the current article
        text_title = text.find('h3')                   # title
        text_cout = text.find("span", class_="count")  # download / citation counts
        # skip http://youxian.cnki links: they fail to open and carry no data (probably
        # need a login); happens for roughly 1 in 20 articles, revisit later
        if re.match(r'http://www.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url) or \
                re.match(r'http://cdmd.cnki.com.cn/Article/[a-zA-Z]+-[0-9a-zA-Z-]+.htm', text_url):
            # visit the article's own page to collect the remaining fields
            text_all = datespider(text_url)
            # article title
            one_text.append(text_title.get_text().replace('\xa0', '').replace('\n', ''))
            # view / citation counts
            one_text.append(text_cout.get_text().replace('\xa0', '').replace('\n', '')
                            .replace('下载次数', '').replace('被引次数', '')
                            .replace('(', '').replace(')', ''))
            # fields returned by datespider: author, institution, degree, classification number
            for item in text_all:
                one_text.append(item.replace('\t', '').replace('\r', '').replace('\n', '')
                                .replace(' ', '').replace('年', ''))
            one_text.append(text_url)  # article link
            All_text.append(one_text)
    return All_text
class DoubanDownloaderMiddleware(object):
    def __init__(self):
        self.count = 0
        # headless mode
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        self.driver = webdriver.Chrome(chrome_options=options)
        self.driver.implicitly_wait(3)
        self.driver.get('https://www.douban.com')
        # switch into the login iframe
        frame = self.driver.find_element_by_xpath('//body//div[@class="login"]/iframe')
        self.driver.switch_to.frame(frame)
        try:
            self.driver.find_element_by_xpath('//body/div[1]/div[1]/ul[1]/li[2]').click()
            time.sleep(0.5)
            self.driver.find_element_by_xpath('//input[@id="username"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//input[@id="password"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//div[@class="account-form-field-submit "]').click()
            # wait for the response
            time.sleep(1)
        # if an element was not found, try once more
        except NoSuchElementException as e:
            print('Loading again: %s' % e)
            self.driver.find_element_by_xpath('//body/div[1]/div[1]/ul[1]/li[2]').click()
            time.sleep(0.5)
            self.driver.find_element_by_xpath('//input[@id="username"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//input[@id="password"]').send_keys('xxx')
            self.driver.find_element_by_xpath('//div[@class="account-form-field-submit "]').click()
            time.sleep(1)
        else:
            print('Successful logging!')

    def process_request(self, request, spider):
        '''
        A special flag so this runs only once, for the login.
        Pagination was planned here but never worked, so leaving it unset is fine.
        '''
        self.count += 1
        if self.count <= 1:
            return HtmlResponse(url=request.url, status=200, request=request,
                                encoding='utf-8', body=self.driver.page_source)
        # add a User-Agent via the third-party my_fake_useragent library
        else:
            ua = UserAgent(family='chrome', os_family='Windows')
            res = ua.random()
            request.headers['User-Agent'] = res
        # a random proxy IP could also be added below; the PROXIES pool must be
        # configured in settings first
def random_header():
    ua = UserAgent()
    random_header = json.loads(r'''{
        "Cache-Control": "max-age=0",
        "Connection": "keep-alive",
        "Host": "www.dogforum.com",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "%s"
    }''' % ua.random())
    return random_header
def __init__(self):
    self.url = 'http://app.mi.com/categotyAllListApi?page={}&categoryId={}&pageSize=30'
    self.q = Queue()      # queue holding every URL to crawl
    self.i = 0
    self.id_list = []     # list of all category ids
    # open the output file
    self.f = open('xiaomi.csv', 'a', newline="")
    self.writer = csv.writer(self.f)
    self.lock = Lock()    # create a lock
    self.ua = UserAgent()
def getListProxies():
    ip_list = []
    session = requests.session()
    headers = {'User-Agent': UserAgent().random()}
    page = session.get("http://www.xicidaili.com/nn", headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
    for trtag in taglist:
        tdlist = trtag.find_all('td')
        proxy = {'http': 'http://' + tdlist[1].string + ':' + tdlist[2].string}
        ip_list.append(proxy)
    return ip_list
def __init__(self): self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action" self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}" self.headers = { "Accept": "application/json, text/javascript, */*", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Connection": "keep-alive", "Content-Length": "169", "Content-Type": "application/x-www-form-urlencoded", "Cookie": "insert_cookie=32151754", "Host": "wzzxbs.mofcom.gov.cn", "Origin": "http://wzzxbs.mofcom.gov.cn", "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36", "X-Requested-With": "XMLHttpRequest" } self.detail_headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Cookie": "insert_cookie=32151754", "Host": "wzzxbs.mofcom.gov.cn", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.data = { "params.entpName": "", "page.currentPage": "", "page.limit": "2000", "page.option": "next", "page.start": "", "page.rowCount": "", "listGrid.col": "1:showRecordInfo(0),2,3,4", "listGrid.type": "link,ro,ro,ro" } self.detail_data = {"params.recordId": "", "time": ""} self.util = Util() self.user_agent = UserAgent()
def test_login():
    """
    Only used to return cookies.
    :return:
    """
    data = {"username": "******", "password": 123456}
    url = "http://pre-admin.mofangcar.com/cms/login"
    headers = {"User-Agent": UserAgent().random()}
    # "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"}
    res = requests.post(url, data=data, headers=headers, verify=False)
    print(res.json())
    # print(res.text)
    # print(res.cookies['JSESSIONID'])
    # some sites keep the cookie info in cookies, others put it in the returned data
    return res.cookies
def postRequestListUrl(name, url, data, code, params):
    if name in listPageRequestHooks:
        if "headers" in listPageRequestHooks[name]:
            headers = listPageRequestHooks[name]['headers']()
        if "cookies" in listPageRequestHooks[name]:
            cookies = listPageRequestHooks[name]['cookies']()
        if "url" in listPageRequestHooks[name]:
            url = listPageRequestHooks[name]['url']()
        if "params" in listPageRequestHooks[name]:
            params = listPageRequestHooks[name]['params']()
        if "timeout" in listPageRequestHooks[name]:
            timeout = listPageRequestHooks[name]['timeout']()
        if "data" in listPageRequestHooks[name]:
            data = listPageRequestHooks[name]['data']()
    result = post(url,
                  data=data,
                  headers=headers if "headers" in locals() else {"User-Agent": UserAgent().random()},
                  timeout=timeout if "timeout" in locals() else 60,
                  params=params if "params" in locals() else None,
                  cookies=cookies if "cookies" in locals() else None,
                  code=code)
    return result
class DownloadImg():
    def __init__(self):
        self.ua = UserAgent()

    def download_one_img(self, img_url, saved_path):
        # download a single image
        header = {
            "User-Agent": "{}".format(self.ua.random().strip()),
            'Connection': 'close'}
        r = requests.get(img_url, headers=header, stream=True)
        print("image request status code {}".format(r.status_code))
        if r.status_code == 200:
            # write the image to disk
            with open(saved_path, mode="wb") as f:
                f.write(r.content)
            print("download {} success!".format(saved_path))
        del r
        return saved_path
def datespider(date_url):
    # set a random UserAgent to get past basic anti-crawler checks
    response_try = requests.get(date_url, headers={'User-Agent': UserAgent().random()})
    # parse with BeautifulSoup
    response_tree = bs(response_try.text, 'html.parser')
    if response_tree == None:
        return []
    else:
        # pick the fields we need out of their positions in the page
        res_date = response_tree.find("font", {"color": "#0080ff"})
        res_name = response_tree.find("div", {"style": "text-align:center; width:740px; height:30px;"})
        res_msg = response_tree.find("div", {"style": "text-align:left;"})
        # date
        if res_date == None:
            response_date = None
        else:
            response_date = res_date.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
        # author
        if res_name == None:
            response_name = None
        else:
            response_name = res_name.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')
        # other information
        if res_msg == None:
            response_msg = ''
        else:
            # strip out the unwanted pieces
            response_msg = res_msg.get_text().replace('\xa0', '').replace('\r', '').replace('\n', '').replace('\t', '')\
                .replace('】', '').replace('学位授予单位:', '').replace('学位级别:', '').replace('作者单位:', '')\
                .replace('学位授予年份:', '').replace('分类号:', '')
        # split response_msg on '【' into the response_point list
        response_point = response_msg.split("【")
        # build and return the result list
        response_All = []
        response_All.append(response_date)
        response_All.append(response_name)
        # append the remaining fields
        for item in range(1, len(response_point)):
            response_All.append(response_point[item])
        return response_All
import logging
from enum import Enum
from collections import defaultdict
from urllib.parse import urlsplit, urljoin, urldefrag
from datetime import datetime
import threading

import requests
from dateutil.relativedelta import relativedelta
from bs4 import BeautifulSoup, SoupStrainer
from my_fake_useragent import UserAgent

from my_python_module.cache_utils import cachedb, func_cache
from my_python_module.datetime_utils import get_timestamp, get_dt_fromtimestamp
from my_python_module.pathlib import mkdirs

logger = logging.getLogger(__name__)

ua = UserAgent(family=['chrome', 'firefox'])


class URLType(Enum):
    """
    refUrl: apart from Absolute URLs, every other URL type needs the refUrl of the
    article it appears in before it can be resolved into an absolute URL.
    """
    Absolute = 1        # 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'
    MissScheme = 2      # '//www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'  (needs refUrl)
    RelativeSite = 3    # '/htbooks/nmr/chap-10/chap-10.htm'  (needs refUrl)
    RelativeFolder = 4  # 'chap-10.html'  (needs refUrl)
    RelativeArticle = 5
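# Hedged illustration, not part of the original module: the docstring above says every
# non-Absolute URL needs the refUrl of the article it was found in before it can be
# resolved. urljoin() performs exactly that resolution; refUrl is a made-up example value.
from urllib.parse import urljoin

refUrl = 'https://www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'
print(urljoin(refUrl, '//www.cis.rit.edu/htbooks/nmr/chap-10/chap-10.htm'))  # MissScheme: scheme taken from refUrl
print(urljoin(refUrl, '/htbooks/nmr/chap-10/chap-10.htm'))                   # RelativeSite: host taken from refUrl
print(urljoin(refUrl, 'chap-10.html'))                                       # RelativeFolder: folder of refUrl kept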
import re
import requests
from lxml import etree
from my_fake_useragent import UserAgent
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', port=3306, user='******', passwd='123',
                       db='yunyun', charset='utf8')
cursor = conn.cursor()

a = UserAgent()
p = a.random()
headers = {
    'User-Agent': p,
    # 'cookie': '__cfduid=dce1ed34975ff71acb9b22d4959d0263b1563521810; ASP.NET_SessionId=1oj0zvk0wttwcudymxjeftpt; UM_distinctid=16c0928d2b2448-03463007e150d9-e343166-144000-16c0928d2b32f6; CNZZDATA1255263807=653621382-1563520703-%7C1563520703; ViewHistory_4=1oj0zvk0wttwcudymxjeftpt; .ynzpauth=869D169A9273686FE3F281194E66EAF796DA177B8799BC0686C9AFD983575676620178F545B8CC60F7FEAA6886B258DF06E4D0E13BBE33ABBA3DCF46FB3A659EE847BBE2696F2256B15111D8D1BDD642178E9567CF7161BDEA9BC44159707D7DF2F8D7D349B8397F87AA820265CC36F284BFECA0EF6E38D76411703DA70E1B5EB03806C9211CD2EC6C800D8E4E9CC840A8734ACC7E31910E493DCF0B2D859E27; viewedResume=2088560%2C1515707%2C727002%2C1218946%2C1623681%2C2131167%2C2121066'
}

for i in range(2957, 10000):
    url = 'http://www.bole.com.cn/resume/resume-show.php?id=' + str(i) + ''
    # print(url)
    try:
        with requests.session() as s:
            a = s.get(url, headers=headers)
            pr = a.text
            # print(pr)
            pattern = re.compile('<div class="personal_info_item">(.*?)</div>')
            rev1 = pattern.findall(pr)
            # print(rev1)
import time
import random
import requests
from my_fake_useragent import UserAgent

from policy_crawl.common.logger import errorlog

headers = {"User-Agent": UserAgent().random()}


def get(url, params=None, headers=headers, code="utf-8", timeout=160, **kwargs):
    res = requests.get(url, params=params, headers=headers, timeout=timeout, **kwargs)
    if res.status_code in [200, 201, 301]:
        return res.content.decode(code)
    else:
        errorlog.logger.error("bad url status_code: %s, status_code: %s" % (url, res.status_code))
        raise ConnectionError("not connected")


def post(url, data=None, headers=headers, code="utf-8", timeout=160, **kwargs):
    res = requests.post(url, data=data,
def __init__(self):
    self.headers = {'User-Agent': UserAgent().random()}
    print(self.headers)
class IpPool:
    def __init__(self):
        self.ua = UserAgent()
        self.headers = {'User-Agent': self.ua.random()}
        # proxy IP API
        self.ipurl = 'http://http.tiqu.qingjuhe.cn/getip?num=1&type=2&pack=51811&port=11&lb=1&pb=4&regions='
        # Redis database
        self.redi = redis.Redis(host='127.0.0.1', port=6379, db=0,
                                decode_responses=True, password='******')
        # counter of failed API requests
        self.count = 0

    # fetch a proxy IP
    def get_ip(self):
        try:
            res = requests.get(url=self.ipurl, headers=self.headers, timeout=10)
            print(res.status_code)
            print('fetched at: {}'.format(
                str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))))),
                res.text)
            if res.status_code != 200:
                self.count += 1
            else:
                self.count -= 1
            # data returned by the API:
            # {"code":0,"data":[{"ip":"223.241.61.18","port":"4336"}],"msg":"0","success":true}
            json_obj = res.json()
            if res.status_code == 200 and json_obj['data'][0]:
                if self.proxyip(json_obj['data'][0]['ip']):
                    return json_obj['data'][0]
            # return {'ip': '127.0.0.1', 'port': '1234'}
        except:
            self.count += 1

    # store an IP
    def set_ip(self, ip):
        print('storing:', ip)
        self.redi.lpush('ip:iplist', json.dumps(ip))

    # check that an IP is still reachable
    def test_ip(self, item):
        item = json.loads(item)
        try:
            telnetlib.Telnet(item['ip'], port=item['port'], timeout=10)
        except:
            return False
        else:
            return True

    def proxyip(self, ip):
        url = 'https://iphunter.net/ip/{}'.format(ip)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
        }
        res = requests.get(url, headers=headers)
        e = etree.HTML(res.text)
        data = ''.join(e.xpath('/html/body/article/script[3]/text()'))
        # page flags neither 代理 (proxy) nor 爬虫 (crawler)
        if '代理' not in data and '爬虫' not in data:
            return True
        else:
            return False

    # main loop
    def engine(self):
        while True:
            if self.redi.llen('ip:iplist') >= 19:
                for item in self.redi.lrange('ip:iplist', 0, -1):
                    print('checked at: {}'.format(
                        str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time()))))),
                        item)
                    if item == None:
                        print(None)
                        # drop the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # top up with a fresh IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
                    if not self.test_ip(item):
                        print(self.test_ip(item))
                        # drop the invalid IP
                        self.redi.lrem('ip:iplist', 1, item)
                        # top up with a fresh IP
                        time.sleep(2)
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            else:
                for i in range(20):
                    time.sleep(2)
                    if self.redi.llen('ip:iplist') <= 20:
                        print('fewer than 20 IPs in the pool')
                        ip = self.get_ip()
                        if ip:
                            self.set_ip(ip)
            time.sleep(30)

    # hand a random proxy IP to the client
    def random_ip(self):
        try:
            iplist = self.redi.lrange('ip:iplist', 0, -1)
        except:
            iplist = []
        if iplist:
            while True:
                ip = random.choice(iplist)
                if ip:
                    ip = json.loads(ip)
                    # ip_info = '183.166.164.209:4370'
                    ip_info = ip['ip'] + ':' + ip['port']
                    proxies = {'https': ip_info}
                    return ip_info
                    # proxies = {'https': '119.5.74.242:4385'}
        else:
            return None

    # run
    def run(self):
        pid = str(os.getpid())
        self.redi.set('pid:ip_pool', pid)
        self.engine()
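# Hedged usage sketch, not from the original project: how a crawler process might consume
# the pool above. Assumes a local Redis already filled by IpPool.run() in a separate
# process; the target URL is only an example.
pool = IpPool()
ip_info = pool.random_ip()        # e.g. '183.166.164.209:4370', or None if the pool is empty
if ip_info:
    proxies = {'https': ip_info}  # same mapping shape the class itself builds
    res = requests.get('https://httpbin.org/ip',
                       headers={'User-Agent': UserAgent().random()},
                       proxies=proxies, timeout=10)
    print(res.text)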
from bs4 import BeautifulSoup
import requests
import csv
from my_fake_useragent import UserAgent

# Mimic the access to the website like a browser
ua = UserAgent(family='chrome')
BrowserUserAgent = ua.random()

# Define URL and Requests object
f = csv.writer(open('drug-names.csv', 'w'))
f.writerow(['Name'])

pages = []
headers = {'User-Agent': BrowserUserAgent}
firstAlphaNumeric = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '0-9', ''
]
# secondAlphaNumeric = firstAlphaNumeric
finalList = []

for first in firstAlphaNumeric:
    for second in firstAlphaNumeric:
        url = 'https://www.drugs.com/alpha/' + str(first) + str(second) + '.html'
        pages.append(url)

for item in pages:
    page = requests.get(item, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
def get_request_headers():
    ua = UserAgent()
    return {"User-Agent": ua.random()}
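# Hedged usage sketch: pairing the helper above with requests; the URL is only an example.
import requests

resp = requests.get("https://example.com", headers=get_request_headers(), timeout=10)
print(resp.status_code)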
proxy = ip_list[0]
error = 0
while 1:
    queue_len = r.llen('queue:ftshop')
    queue_index = 0
    s = requests.session()
    n = str(q.get_nowait(), encoding='utf8')
    data = json.loads(n)
    shopid = data['shopid']
    region = data['region']
    area = data['area'].encode("utf-8").decode("utf-8")
    headers = {
        'User-Agent': UserAgent().random(),
        'Referer': 'https://m.dianping.com/shenzhen/ch10/r{0}'.format(region_dict[area])
    }
    url = 'https://m.dianping.com/shop/' + shopid
    try:
        respon = s.get(url, headers=headers, proxies=proxy)
    except Exception as e:
        error = 1
    i = 0
    while '验证中心' in respon.text or '抱歉!页面暂' in respon.text or respon.status_code != 200 or error == 1:
        i = i + 1
        if i < len(ip_list):
            proxy = ip_list[i]
            try:
                respon = s.get(url, headers=headers, proxies=proxy)
user_agent_list = [
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET CLR 3.5.30729)'
]
user_agent = random.choice(user_agent_list)
headers = {'User-Agent': user_agent}
ua = UserAgent()


def Organizer(request):
    lr = LowRange()
    P = []


def LowRange():
    Alink = r'https://www.amazon.in/s?i=electronics&bbn=1805560031&rh=n%3A976419031%2Cn%3A976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cn%3A1805560031%2Cp_36%3A1318506031%2Cp_89%3AAsus%7CHUAWEI%7CHonor%7CLenovo%7CMi%7CMotorola%7CNokia%7COPPO%7CRedmi%7CSamsung%7CVivo%7CXiaomi%7Crealme%2Cp_n_operating_system_browse-bin%3A1485077031%2Cp_72%3A1318476031%2Cp_n_feature_seven_browse-bin%3A8561133031%2Cp_n_feature_eight_browse-bin%3A8561112031%7C8561116031%2Cp_n_feature_three_browse-bin%3A1897963031%2Cp_n_condition-type%3A8609960031%2Cp_n_feature_five_browse-bin%3A8561106031%2Cp_n_feature_two_browse-bin%3A1898707031%2Cp_n_pct-off-with-tax%3A2665399031%2Cp_n_feature_thirteen_browse-bin%3A8561102031&dc&fst=as%3Aoff&qid=1567887652&rnid=8561098031&ref=sr_nr_p_n_feature_thirteen_browse-bin_2'
    Flink = r'https://www.flipkart.com/mobiles/smartphones~trunype/pr?sid=tyy%2C4io&p%5B%5D=facets.processor_brand%255B%255D%3DSnapdragon&p%5B%5D=facets.sim_type%255B%255D%3DDual%2BSim&p%5B%5D=facets.internal_storage%255B%255D%3D64%2B-%2B127.9%2BGB&p%5B%5D=facets.number_of_cores%255B%255D%3DOcta%2BCore&p%5B%5D=facets.rating%255B%255D%3D4%25E2%2598%2585%2B%2526%2Babove&p%5B%5D=facets.price_range.from%3D10000&p%5B%5D=facets.price_range.to%3D20000&p%5B%5D=facets.availability%255B%255D%3DExclude%2BOut%2Bof%2BStock&p%5B%5D=facets.type%255B%255D%3DSmartphones&p%5B%5D=facets.operating_system%255B%255D%3DAndroid&p%5B%5D=facets.screen_size%255B%255D%3D6%2Binch%2B%2526%2Babove&p%5B%5D=facets.battery_capacity%255B%255D%3D5000%2BmAh%2B%2526%2BAbove&p%5B%5D=facets.battery_capacity%255B%255D%3D4000%2B-%2B4999%2BmAh&p%5B%5D=facets.clock_speed%255B%255D%3D2.5%2BGHz%2B%2526%2BAbove&p%5B%5D=facets.clock_speed%255B%255D%3D2%2B-%2B2.5%2BGHz&p%5B%5D=facets.network_type%255B%255D%3D4G%2BVOLTE&p%5B%5D=facets.brand%255B%255D%3DSamsung&p%5B%5D=facets.brand%255B%255D%3DMi&p%5B%5D=facets.brand%255B%255D%3DHonor&p%5B%5D=facets.brand%255B%255D%3DHuawei&p%5B%5D=facets.brand%255B%255D%3DMotorola&p%5B%5D=facets.brand%255B%255D%3DOPPO&p%5B%5D=facets.brand%255B%255D%3DNokia&p%5B%5D=facets.brand%255B%255D%3DVivo&p%5B%5D=facets.brand%255B%255D%3DPOCO&p%5B%5D=facets.ram%255B%255D%3D4%2BGB&p%5B%5D=facets.ram%255B%255D%3D6%2BGB%2B%2526%2BAbove&p%5B%5D=facets.serviceability%5B%5D%3Dfalse'
    Slink = r'https://www.snapdeal.com/products/mobiles-mobile-phones/filters/Form_s~Smartphones?sort=plrty&q=Form_s%3ASmartphones%7CPrice%3A10000%2C20000%7CBrand%3AVivo%5EOppo%5EMI%5EMoto%7CRAM_s%3A4%20GB%5E6%20GB%7CConnectivity_s%3AVoLTE%7CScreensize_s%3A6.0%20%26%20Above%7CPrimaryCamera_s%3A8MP-13MP%7C'
    linklist = {Alink: AV.Hello, Flink: FV.Hello, Slink: SV.Hello}
def __init__(self):
    self.ua = UserAgent()
import requests
from my_fake_useragent import UserAgent
import json
from pymongo import MongoClient
from pyquery import PyQuery as pq
import random
import time

ua = UserAgent()
headers = {"User-Agent": ua.random()}
client = MongoClient(host="localhost", port=27017)
collection = client["发改委"]['辽宁1']


def parse_detail(html, url):
    ret = {}
    doc = pq(html)
    ret['url'] = url
    ret['title'] = doc(".news-content-main h1").text()
    ret['sourceTime'] = doc(".news-info").text()
    ret['content'] = doc('#ContTextSize').text()
    ret['contentUrl'] = doc("#ContTextSize a").attr("href")
    print(ret)
    collection.insert_one(ret)


def parse_index(html):
    doc = pq(html)
    items = doc(".mod-body2 ul li").items()
    for item in items:
def crawl(self):
    ua = UserAgent()
    headers = {'User-Agent': '{}'.format(ua.random())}
    print(self.spider_name, 'now crawling', self.url_key)
    try:
        raw_contents = requests.get(self.url, headers=headers).text
        match_pattern = r'<td(.*?)</td>'
        level_1_soup_list = re.findall(match_pattern, raw_contents, re.S | re.M)
        level_2_soup_list = []
        for level_1_soup in level_1_soup_list:
            level_2_soup = level_1_soup.split('>')[1]
            level_2_soup_list.append(level_2_soup)
        project_name = level_2_soup_list[1]
        project_number = level_2_soup_list[3]
        project_intro = level_2_soup_list[5]
        project_link = level_2_soup_list[7].split('\"')[1]  # Special Design
        project_purpose = level_2_soup_list[9]
        project_size = level_2_soup_list[11]
        project_duration = level_2_soup_list[13]
        project_apr = level_2_soup_list[15]
        project_repay_start = level_2_soup_list[17]
        project_repay_method = level_2_soup_list[19].strip()  # Special Design
        project_repay_details = level_2_soup_list[21]
        project_status = level_2_soup_list[23].strip()  # Special Design
        project_raise_start = level_2_soup_list[25]
        project_guarantee = level_2_soup_list[27]
        project_repay_source = level_2_soup_list[29]
        project_risk = level_2_soup_list[31]
        project_expense = level_2_soup_list[33]
        project_template_number = level_2_soup_list[35]
        project_lender_notice = level_2_soup_list[37]
        project_borrower_type = level_2_soup_list[39].strip()  # Special Design
        project_borrower_name = level_2_soup_list[43]
        project_document_type = level_2_soup_list[45].strip()  # Special Design
        project_document_number = level_2_soup_list[47]
        project_borrower_job = level_2_soup_list[49]
        project_borrower_other_info = level_2_soup_list[51]
        project_borrower_credit = level_2_soup_list[53]
        project_borrower_default_times = level_2_soup_list[55]
        project_borrower_default_amounts = level_2_soup_list[57]
        project_borrower_income_and_debt = level_2_soup_list[59]
        self.list_of_attribute = [
            self.url_key, project_name, project_number, project_intro,
            project_link, project_purpose, project_size, project_duration,
            project_apr, project_repay_start, project_repay_method,
            project_repay_details, project_status, project_raise_start,
            project_guarantee, project_repay_source, project_risk,
            project_expense, project_template_number, project_lender_notice,
            project_borrower_type, project_borrower_name, project_document_type,
            project_document_number, project_borrower_job,
            project_borrower_other_info, project_borrower_credit,
            project_borrower_default_times, project_borrower_default_amounts,
            project_borrower_income_and_debt
        ]
        print(self.spider_name, 'has finished the crawling from', self.url_key)
    except:
        # on any failure every field, including the url_key slot, is recorded as
        # "FAIL" (30 entries, matching the success branch above)
        self.list_of_attribute = ["FAIL"] * 30
        print(self.spider_name, "has failed and gives", self.url_key, "to another spider")
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: Cedar
# @Date  : 2021/3/22
# @Desc  :
from my_fake_useragent import UserAgent

ua = UserAgent(phone=True)
print(ua.random())