from collections import namedtuple
import urllib.request as request

from fake_useragent import UserAgent as UA
from lxml import etree


def get_proxy_list():
    request_obj = request.Request(url='http://www.xicidaili.com/',
                                  headers={'User-Agent': UA().random})
    response = request.urlopen(request_obj)
    html = str(response.read(), encoding='utf-8')
    xml = etree.HTML(html)
    ip_list = xml.xpath('//tr[@class="odd" or @class]/td[2]/text()')
    port_list = xml.xpath('//tr[@class="odd" or @class]/td[3]/text()')
    type_list = xml.xpath('//tr[@class="odd" or @class]/td[6]/text()')
    endure_list = xml.xpath('//tr[@class="odd" or @class]/td[7]/text()')
    last_check_list = xml.xpath('//tr[@class="odd" or @class]/td[8]/text()')
    proxy_list = []
    proxy = namedtuple('proxy', ['ip', 'port', 'type', 'endure', 'last_check'])
    for i in range(len(ip_list)):
        proxy_list.append(
            proxy(ip_list[i], port_list[i], type_list[i],
                  get_time(endure_list[i]), get_time(last_check_list[i])))
    # Sort by last-check time so the proxies most likely to connect successfully come first
    proxy_list.sort(key=lambda item: item.last_check)
    return proxy_list
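# Usage sketch (an assumption, not part of the original script): take the
# first proxy returned by get_proxy_list() and route a requests call through it.
# fetch_via_proxy and target_url are hypothetical names; the attribute names
# (ip, port, type) come from the namedtuple defined above.
import requests


def fetch_via_proxy(target_url):
    best = get_proxy_list()[0]
    scheme = best.type.lower()  # xicidaili lists 'HTTP' / 'HTTPS'
    proxies = {scheme: '{}://{}:{}'.format(scheme, best.ip, best.port)}
    return requests.get(target_url, proxies=proxies, timeout=10)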
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': UA().random
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tencent_positions.middlewares.TencentPositionsSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'tencent_positions.middlewares.TencentPositionsDownloaderMiddleware': 543,
#}

# Enable or disable extensions
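# Note: UA().random in DEFAULT_REQUEST_HEADERS is evaluated once when settings.py
# is imported, so the whole crawl reuses one User-Agent. Rotating it per request
# requires a downloader middleware. The sketch below is an assumption:
# RandomUserAgentMiddleware is a hypothetical class, not something already present
# in this project's middlewares.py.
from fake_useragent import UserAgent as UA


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UA()

    def process_request(self, request, spider):
        # Overwrite the User-Agent header on every outgoing request
        request.headers['User-Agent'] = self.ua.random

# Hypothetical registration in settings.py:
# DOWNLOADER_MIDDLEWARES = {
#     'tencent_positions.middlewares.RandomUserAgentMiddleware': 400,
# }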
# encoding='utf-8'
import requests
import random
from selenium import webdriver as web
from fake_useragent import UserAgent as UA
from lxml import etree
import urllib.request as ur
import time

ua = UA()
chromePath = "d:/chromedriver.exe"
driver = web.Chrome(executable_path=chromePath)
url = 'https://www.pornhub.com/view_video.php?viewkey=ph5cf931e03d669'
driver.get(url)
# tag = driver.find_element_by_class_name("mhp1138_btn mhp1138_volume-low mhp1138_icon mhp1138_icon-volume-low")
# print(tag)
driver.minimize_window()
print('-' * 30, 'Starting download')
tag = driver.find_element_by_class_name("mhp1138_videoWrapper")
downloadFileURL = tag.find_element_by_tag_name("source").get_attribute("src")
driver.quit()

# Get the size of the file to download
responseFile = requests.get(url=downloadFileURL, stream=True)
downFileSize = int(responseFile.headers['content-length'])
print('File size:', downFileSize)
print('File type:', type(downFileSize))

# Start downloading the file
size = 0
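# The original snippet stops right after size = 0. A minimal sketch of the chunked
# download it appears to set up, assuming a local file name of 'video.mp4'
# (hypothetical) and a 1 MB chunk size.
chunk_size = 1024 * 1024
with open('video.mp4', 'wb') as f:
    for chunk in responseFile.iter_content(chunk_size=chunk_size):
        if chunk:
            f.write(chunk)
            size += len(chunk)
            # Report progress as a percentage of the Content-Length
            print('Downloaded {:.2f}%'.format(size * 100 / downFileSize))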
if("方式" in sstr): tmp_rent.rent_type = sstr.split(':')[1] def getdate(beforeOfDay): # 获取前1天或N天的日期,beforeOfDay=1:前1天;beforeOfDay=N:前N天 today = datetime.datetime.now() # 计算偏移量 offset = datetime.timedelta(days=-beforeOfDay) # 获取想要的日期的时间 re_date = (today + offset).strftime('%Y-%m-%d') return re_date headers = { 'Accept': 'text/html, application/xhtml+xml, image/jxr, */*', 'Accept - Encoding': 'gzip, deflate', 'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5', 'Connection': 'Keep-Alive', 'User-Agent': UA().random # 获取随机的useragent } # ua = UserAgent() # posC = "C:\Download\small App\WebDriver\chromedriver.exe" # s = Session(webdriver_path=posC, # browser='chrome', # default_timeout=15, # webdriver_options={'arguments': ['headless','--no-sandbox','--disable-gpu']}) # # 'arguments': ['headless','--no-sandbox','--disable-gpu'] # # response=s.get('http://www.yeeyi.com/bbs/forum.php?mod=viewthread&tid=4523663',headers={'User-Agent':ua.chrome}).text # pages=10 # url = "http://www.yeeyi.com/bbs/forum.php?mod=viewthread&tid=4521587" # print(1) # s.driver.get(url) # print(1)
class TopMoviesSpider(CrawlSpider):
    name = 'douban'
    allowed_domains = [
        'www.douban.com', 'movie.douban.com', 'accounts.douban.com'
    ]
    start_urls = [
        'https://movie.douban.com/top250?start=0',
    ]
    # Generate a random User-Agent
    headers = {'User-Agent': UA().random}
    # Rule for extracting pagination links
    page_extractor = LinkExtractor(allow=(r'start=\d*', ))
    rules = [Rule(page_extractor, follow=True, callback='parse_movie_info')]

    # Handle requests: the login request is issued first
    def start_requests(self):
        return [
            scrapy.FormRequest(url="https://accounts.douban.com/login",
                               headers=self.headers,
                               meta={"cookiejar": 1},
                               callback=self.login_douban)
        ]

    # Read captcha-id from the login page response; it is one of the required
    # parameters of the POST request sent afterwards
    def login_douban(self, response):
        # Get captcha-id and the captcha image URL
        captcha_id = response.xpath(
            '//input[@name="captcha-id"]/@value').extract()
        # Prepare the basic formdata
        formdata = {
            'source': 'None',
            'form_email': input('Please enter the account name: '),
            'form_password': input('Please enter the password: '),
            'login': '******'
        }
        if captcha_id:
            captcha_id = captcha_id[0]
            captcha_url = response.xpath(
                '//img[@id="captcha_image"]/@src').extract()[0]
            # If there is a captcha image URL, download the captcha image locally
            captcha_path = os.path.dirname(
                os.path.dirname(__file__)) + '/captcha-images/captcha.jpg'
            urlretrieve(captcha_url, captcha_path)
            # Make sure the captcha image was downloaded successfully
            try:
                image = Image.open(captcha_path)
                image.show()
                captcha_solution = input('Please enter the captcha: ')
                formdata['captcha-id'] = captcha_id
                formdata['captcha-solution'] = captcha_solution
            except FileNotFoundError:
                pass
        # After logging in, carry the cookie along to the pages to be crawled
        return scrapy.FormRequest.from_response(
            response,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            formdata=formdata,
            callback=self.start_parse)

    # Start crawling the movie pages; whenever a page contains links matching
    # the rules, the callback is invoked to process them and produce items
    def start_parse(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_movie_info(self, response):
        movies_in_page = response.xpath('//div[@class="info"]')
        for each in movies_in_page:
            item = items.DoubanMovieItem()
            item['title'] = each.xpath(
                './/span[@class="title"][1]/text()').extract()[0].replace(
                    '\n', '').replace('\xa0', '').strip()
            item['info'] = each.xpath(
                './div[@class="bd"]/p/text()').extract()[0]
            item['score'] = each.xpath(
                './/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()[0]
            quote = each.xpath('.//p[@class="quote"]/span/text()').extract()
            if quote:
                item['quote'] = quote[0].replace('\xa0', '').strip()
            else:
                item['quote'] = ''
            yield item
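# The spider above populates four fields on items.DoubanMovieItem. The project's
# items.py is not shown here; a minimal sketch of what it presumably contains,
# with the field names inferred from the assignments in parse_movie_info.
import scrapy


class DoubanMovieItem(scrapy.Item):
    title = scrapy.Field()  # movie title
    info = scrapy.Field()   # director / cast / year line
    score = scrapy.Field()  # rating number
    quote = scrapy.Field()  # one-line quote, may be empty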