class Spider():
    """Entry point for the Sogou-Weixin search crawler.

    Holds the shared HTTP session, the Redis-backed request queue and the
    browser-copied request headers, and enqueues the first search request.
    """

    # Search endpoint and the keyword to query for.
    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Request headers copied out of the browser's developer tools after
    # logging in; the Cookie field is what allows crawling all 100 result
    # pages instead of the anonymous limit.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        # FIX: the original value contained 'zh=TW', which is not a valid
        # language tag; the intended subtag is 'zh-TW'.
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN1100; SUID=6FEDCF3C541C970A000000005968CF55; SUV=1500041046435211;'
                  'ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1;'
                  'JSESSIONID=aaar_m7LEIW-jg_gikPZv; Id=WKllllllll2BzGMVlllllV0o8cUlllll5G@HbZllll9lllllRklll5'
                  '@@@@@@@@@@',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        # NOTE(review): this User-Agent is missing spaces around the
        # parenthesised segments ('Mozilla/5.0(Macintosh...'); it looks like a
        # copy/paste artifact but is kept byte-identical since the server
        # evidently accepted it.
        'User-Agent': 'Mozilla/5.0(Macintosh; Intel Mac OS X 10_12_3)AppleWebKit/537.36(KHTML, likeGecko)'
                      'Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()    # shared session so the headers apply to every request
    queue = RedisQueue()   # Redis-backed queue of pending WeixinRequests

    def start(self):
        """Initialise the crawl: install headers globally and enqueue the
        first search-results request.

        :return: None
        """
        # Update the session headers globally so all requests carry the cookies.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        # parse_index handles the response once it is fetched; need_proxy=True
        # because Sogou rate-limits by IP.
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index, need_proxy=True)
        # Hand the first request to the scheduler via the Redis queue.
        self.queue.add(weixin_request)
class Spider():
    """Sogou-Weixin crawler bootstrap: shared session, Redis queue, and the
    first search request."""

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Browser-copied request headers.
    # FIX: the original dict literal listed 'Cache-Control' and 'Connection'
    # twice (identical values; the later key silently wins in a dict literal),
    # and several values used backslash-continued string literals that embedded
    # stray whitespace inside the header values. Both cleaned up here.
    # NOTE(review): 'Content-Encoding' is a *response* header and claims the
    # (nonexistent) request body is gzipped; kept as-is to preserve the exact
    # headers sent, but it should probably be removed.
    headers = {
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Content-Encoding': 'gzip',
        'Accept': 'text/html,application/xhtml+xml,application/xml;'
                  'q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip,deflate,br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cookie': 'SMYUV=1556302359983345;ABTEST=2|1560665246|v1;IPLOC=CN4201;'
                  'SUID=ADB93971771A910A000000005D05DC9E;'
                  'SUID=ADB939712F20910A000000005D05DCA5;weixinIndexVisited=1;sct=1;'
                  'SNUID=C9DC5D156461EE0FD7DF3DEE65B24370;JSESSIONID=aaaMCpeCkX1I86hi_hiRw',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/74.0.3729.108 Safari/537.36',
    }
    session = Session()
    queue = RedisQueue()

    def start(self):
        """Initialise the crawl: install the headers on the session and
        schedule the first search-results request.

        :return: None
        """
        # Global header update so every request carries the cookies.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)
import pymongo import threading chrome_options = webdriver.ChromeOptions() chrome_options.add_argument( '--user-data-dir=C:\\Users\\lx\\AppData\\Local\\Google\\Chrome\\User Data') browser = webdriver.Firefox() wait = WebDriverWait(browser, 10) MONGO_URL = 'localhost' MONGO_DB = 'wenshu' MONGO_COLLECTION = 'wenshu' client = pymongo.MongoClient(MONGO_URL) db = client[MONGO_DB] queue = RedisQueue() URL = 'http://wenshu.court.gov.cn/list/list/?sorttype=1&number=GKXHF5CE&guid=0aef5a25-a03b-3dfcb27f-7af4bb80f708&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2+++%E4%B8%AD%E7%BA%A7%E6%B3%95%E9%99%A2:%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2' PAGE = 10 def index_page(): # 爬取页面的超链接并放入redis,并且点击下一页,(看能否使用多线程,一个爬主页,一个存信息T) try: url = 'http://wenshu.court.gov.cn/list/list/?sorttype=1&number=GKXHF5CE&guid=0aef5a25-a03b-3dfcb27f-7af4bb80f708&conditions=searchWord+2+AJLX++%E6%A1%88%E4%BB%B6%E7%B1%BB%E5%9E%8B:%E6%B0%91%E4%BA%8B%E6%A1%88%E4%BB%B6&conditions=searchWord+%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2+++%E4%B8%AD%E7%BA%A7%E6%B3%95%E9%99%A2:%E5%B9%BF%E4%B8%9C%E7%9C%81%E6%B7%B1%E5%9C%B3%E5%B8%82%E4%B8%AD%E7%BA%A7%E4%BA%BA%E6%B0%91%E6%B3%95%E9%99%A2' browser.get(url) wait.until( EC.text_to_be_present_in_element(( By.CSS_SELECTOR, '#resultList > div:nth-child(1) > table > tbody > tr:nth-child(2) > td > div' ), '广东')) time.sleep(5)
class Spider():
    """Sogou-Weixin article crawler.

    Pops WeixinRequests off a Redis queue, fetches them through a shared
    requests Session (optionally via a proxy pool), parses index and detail
    pages with pyquery, and stores article dicts in MongoDB.
    """

    count = 1                      # index (search-result) pages crawled so far
    base_url = 'http://weixin.sogou.com/weixin'
    proxypool_url = PROXYPOOL_URL  # endpoint returning one random proxy per GET
    keyword = KEYWORD
    # Headers copied from a logged-in browser session; the Cookie is what lets
    # the crawl go past the anonymous page limit.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'ABTEST=5|1532314313|v1; SUID=B50142311F2D940A000000005B5542C9; weixinIndexVisited=1;'
                  ' ppinf=5|1532326946|1533536546|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxODolRTUlQTUlQkQlRTYl'
                  'ODMlQjN8Y3J0OjEwOjE1MzIzMjY5NDZ8cmVmbmljazoxODolRTUlQTUlQkQlRTYlODMlQjN8dXNlcmlkOjQ0Om85dDJsdU1zcUd'
                  'LUkZhNFhHUFZDY05WcVg4Nk1Ad2VpeGluLnNvaHUuY29tfA; pprdig=l7bPSuHEX2Dw59St_Hr2jsd9yOCiEQg-SINIWoJwCTf'
                  '2NQ4D7oVXLanLnrvYbyRy_v1-ELWd_AxHVeBrAv0m6MNA_sLeRYd4rZK6oGkl7MuMcIwLsO1LNymIEDQyzrO5EKUiD6XDGKr5nS'
                  'v3-FK2IT2yKUXHdv_CHpYLO507QTc; sgid=20-36194065-AVtVdCKIibiaGVBqLwI9ItDZU; UM_distinctid=164c81479f'
                  '755c-0932f36bc02d29-5b193613-144000-164c81479f8640; CNZZDATA1261666818=1219770327-1532362355-%7C153'
                  '2362355; IPLOC=CN3202; SUID=3E0442313108990A000000005B56963C; SUV=00771DED3142043E5B56963D70C16776;'
                  ' sct=5; SNUID=142E681B2B2F5B9FDD32825E2B9C0829; JSESSIONID=aaatZ36oaO6QKh8i9qHsw; ppmdig=1532422395'
                  '00000000ab72fee141c5ddb0e58074d76e7065',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/67.0.3396.99 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = Mysql()    # constructed but unused; MongoDB is the active store
    mongo = MongoDB()

    def start(self):
        """Initialise: install headers globally and enqueue the first request."""
        # Global header update so every request carries the cookies.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue: fetch each request, feed the response to its
        callback, re-enqueue new requests and persist parsed articles."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.send_request(weixin_request)
            if response and response.status_code == 200:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', result)
                        if isinstance(result, WeixinRequest):
                            # Follow-up request: back onto the queue.
                            self.queue.add(result)
                        if isinstance(result, dict):
                            # Parsed article: persist to MongoDB.
                            self.mongo.insert(result)
                else:
                    print('获得的页面不正确')
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def send_request(self, weixin_request):
        """Execute a WeixinRequest.

        :param weixin_request: request to send
        :return: the Response, or False on connection error / timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    # NOTE(review): redirects are NOT followed on the proxy
                    # path while the direct path below follows them; the
                    # author's own note says redirects must be allowed or the
                    # status is 301. Kept as-is to preserve behaviour -- confirm
                    # whether this asymmetry is intentional.
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            # Redirects must be allowed here, otherwise the status code is 301.
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=True)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def get_proxy(self):
        """Fetch one proxy address from the proxy pool.

        :return: 'host:port' string, or None when the pool is unreachable
        """
        try:
            response = requests.get(self.proxypool_url)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except ConnectionError:
            return None

    def parse_index(self, response):
        """Parse a search-results page.

        Yields one WeixinRequest per article link, plus a request for the next
        results page while under MAX_PAGE.
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # FIX: renamed from 'next', which shadowed the builtin of that name.
        next_href = doc('#sogou_next').attr('href')
        if self.count < MAX_PAGE:
            if next_href:
                self.count += 1
                url = self.base_url + next_href
                weixin_request = WeixinRequest(url=url, callback=self.parse_index,
                                               need_proxy=True)
                yield weixin_request

    def parse_detail(self, response):
        """Parse an article detail page and yield its fields as a dict."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            # Collapse all whitespace runs in the article body.
            'content': ''.join((doc('.rich_media_content').text()).split()),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def error(self, weixin_request):
        """Count a failure and re-queue the request until MAX_FAILED_TIME."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)
class Spider():
    """Sogou-Weixin crawler for the '世界杯' keyword.

    Schedules WeixinRequests via a Redis queue, fetches them through a shared
    Session (optionally via a proxy pool) and stores parsed articles in MySQL.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = '世界杯'
    # Headers copied from a logged-in browser session.
    headers = {
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9',
        'Connection': 'keep-alive',
        'Cookie': 'SNUID=ABDB5DA1D1D4A1E78684A865D107AE0F; IPLOC=CN6101; SUID=7A0B8C715F20940A000000005B3D82E1; SUV=1530757856027923; ABTEST=0|1530757927|v1; weixinIndexVisited=1; sct=1; JSESSIONID=aaaEVzTuhjDDbeEJCJgrw; ppinf=5|1530758022|1531967622|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5Ok03NzIzMjkxN3xjcnQ6MTA6MTUzMDc1ODAyMnxyZWZuaWNrOjk6TTc3MjMyOTE3fHVzZXJpZDo0NDpvOXQybHVQbHhIcjJGZEh6UWxtWTk2elNSSzdnQHdlaXhpbi5zb2h1LmNvbXw; pprdig=nllAYaYxssp0hiUDLbEvvzmxf01k-Yp_ap-DE9ySNTT_ml1urWFbceFAl3tDw8mIzO-xRANMxd1RyOjH4hBYnHTtdad7i4cMcKCToqIkuNgoVg-v8hRMUAthv-42GI5QRC3QD5j-jVdSJ26-_0xZfS2YrhmYnKXvtpItdZpUI6I; sgid=13-35854231-AVs9g4blTLWoo7vKJzNYu4g; ppmdig=15307580220000006976bdc9e221344757bb35c7bc99b93f',
        'Host': 'weixin.sogou.com',
        'Referer': 'http://weixin.sogou.com/weixin?query=世界杯&_sug_type_=&sut=6402&lkt=4%2C1530757936771%2C1530757943164&s_from=input&_sug_=y&type=2&sst0=1530757943266&page=1&ie=utf8&w=01019900&dr=1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one random proxy from the pool (PROXY_POOL_URL from config).

        :return: 'host:port' string, or None when unavailable
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Install headers globally and enqueue the first search request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results page: yield one request per article link and
        one request for the next page when present."""
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        # FIX: the original iterated 'for item in items():' -- but items is
        # already the generator returned by .items(); calling it raises
        # TypeError, so no article link was ever followed.
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # FIX: renamed from 'next', which shadowed the builtin of that name.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article detail page and yield its fields as a dict."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#publish_time').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text()
        }
        yield data

    def request(self, weixin_request):
        """Execute a WeixinRequest, routing through a pool proxy when required.

        :return: the Response, or False on connection error / timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    # prepare() converts the WeixinRequest into a
                    # PreparedRequest for Session.send().
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request until MAX_FAILED_TIME."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        # FIX: log message typo 'Faild' -> 'Failed'.
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Main loop: pop requests, dispatch responses to their callbacks,
        enqueue follow-up requests and store parsed articles in MySQL."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', result)
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Spider():
    """Sogou-Weixin crawler: Redis-scheduled requests, optional proxy-pool
    routing, pyquery parsing, MySQL storage."""

    base_url = 'https://weixin.sogou.com/weixin?'
    # Headers copied from a logged-in browser session.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36',
        'Host': 'weixin.sogou.com',
        'Cookie': 'CXID=DC6BC45CC7C377F723DD3A443FE31E4C; SUID=173F49DF3565860A5D15F0D5000657EA; wuid=AAGPCPd4KAAAAAqLFBtmTQ0AGwY=; SUV=008FECF4DF4AD4815D27703DD6AB0621; ABTEST=6|1565489065|v1; IPLOC=CN4451; weixinIndexVisited=1; JSESSIONID=aaaMUF5foWevaeAF9kpXw; sct=3; ppinf=5|1566048243|1567257843|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxMTplY2hveHh6aGFuZ3xjcnQ6MTA6MTU2NjA0ODI0M3xyZWZuaWNrOjExOmVjaG94eHpoYW5nfHVzZXJpZDo0NDpvOXQybHVFa1JhOHg0VlN1akEyVWFfWm5jLUlRQHdlaXhpbi5zb2h1LmNvbXw; pprdig=DBBFR2RZZygrUSj6gi2wMUB1mVAlkg1-pTbbkU6YY8rwyrLS6hOyRYi6q8XatVyTLI17Yow3q-RsViuhloTsy7OAJSg2B0PDjiWmSpT53CWn12TzQvcHyhBz9CboxgT-HyjGlyaHZXc5_nX2IY5O8daePYZ5OTh_8FUwTbElygg; sgid=29-42669099-AV1XicicOCeeX2kdrUcM6AiaTo; ppmdig=15660482440000000132fe6a9586fc74c7d7bd1e47980e9d; PHPSESSID=m6o5qn0mr83kplu2nbhu20i2j0; SNUID=3EBF21B46A6EFB3A61BBFA7F6BB0F6F8; successCount=1|Sat, 17 Aug 2019 13:30:39 GMT',
        'Connection': 'keep-alive'
    }
    session = requests.Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one random proxy from the pool.

        :return: proxy 'host:port' string, or None on failure
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('获取到代理', response.text)
                return response.text
        except requests.ConnectionError:
            print('出错了')
            return None

    def start(self):
        """Install headers on the session and enqueue the first request.

        NOTE(review): the query parameters come from a module-level `the_dict`
        defined elsewhere in the file -- confirm it holds the intended
        'query'/'type' mapping.
        """
        self.session.headers.update(self.headers)   # apply headers globally
        start_url = self.base_url + urlencode(the_dict)  # build the search URL
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        self.queue.add(weixin_request)   # enqueue the seed request

    def parse_index(self, response):
        """Parse a results page.

        :param response: the fetched response
        :return: generator of detail-page and next-page WeixinRequests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            # One request per article link on the results page.
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # FIX: renamed from 'next', which shadowed the builtin of that name.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)   # next results page
            weixin_request = WeixinRequest(url=url, callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article detail page.

        :param response: the fetched response
        :return: generator yielding one article dict
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Execute a WeixinRequest, via a pool proxy when need_proxy is set.

        :return: the Response, or False on connection error / timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        # FIX: the key was 'http:' (trailing colon), so the
                        # proxy mapping never matched plain-HTTP requests and
                        # they went out from the local IP.
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request while under the cap.

        :param weixin_request: the failed request
        """
        weixin_request.fail_time = weixin_request.fail_time + 1  # one more failure
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)   # retry later

    def schedule(self):
        """Main loop: pop requests and dispatch their responses.

        NOTE(review): when the response is missing or has a bad status code
        nothing calls error(), so such requests are silently dropped --
        unlike the sibling implementations; confirm this is intended.
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print(weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    # Callback produced nothing: treat as a failure.
                    self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Wechat(object):
    """Sogou-Weixin crawler for the 'nba' keyword: Redis-scheduled requests,
    proxy-pool routing, pyquery parsing and MySQL storage."""

    base_url = "http://weixin.sogou.com/weixin"
    keyword = "nba"
    # Headers copied from a logged-in browser session.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1.1 Safari/605.1.15',
        "Cookie": 'sct=2; ppmdig=153798156900000029b2c90dd10104a88b02afad382cf017; ppinf=5|1537799457|1539009057|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZToxMTptcmxvbmVseWp0cnxjcnQ6MTA6MTUzNzc5OTQ1N3xyZWZuaWNrOjExOm1ybG9uZWx5anRyfHVzZXJpZDo0NDpvOXQybHVGUEpyZHBxQ0dWR09VdG4xb1ZveU5nQHdlaXhpbi5zb2h1LmNvbXw; pprdig=YrgiDGJBorgcLoekzZXOYzQGDDiQ1I06___HgQ82ouWk1pkxaD2ON7U0nMVTZ7cKn6QkbWSGMo2frdWi81FXGx76xMXLPID5wg-hMXm2x9GKImc75S2POjyaM1ybQGA8ICZmUgUcOJt2hIfQCPulkQjE2rGEOPp6zbBzPCvXinE; sgid=02-35144799-AVuo9SEdkhQRYFdDtzBL72k; SNUID=3A94A629F3F6856D4D4A75AAF413208B; JSESSIONID=aaadNfym5dl1OgUejcHvw; SUV=006425A0DA5267C95BA8F38BC42D4423; IPLOC=CN3100; SUID=C96752DA2C12960A000000005BA8F38A; SUID=C96752DA3F18960A000000005BA8F389; ABTEST=0|1537799049|v1; weixinIndexVisited=1',
        "Host": 'weixin.sogou.com',
        # FIX: this value was the int 1; requests rejects non-str/bytes header
        # values, so every send() would fail.
        "Upgrade-Insecure-Requests": '1',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        "Accept-Encoding": 'gzip, deflate',
        "Accept-Language": 'zh-CN,zh;q=0.9',
        "Cache-Control": 'no-cache',
        "Connection": 'keep-alive'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one random proxy from the pool, or None on failure."""
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print("Get Proxy", response.text)
                return response.text
            return None
        except ConnectionError:
            return None

    def start(self):
        """Install headers on the session and enqueue the first request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + "?" + urlencode({"query": self.keyword, "type": 2})
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        self.queue.add(weixin_request)

    def schedule(self):
        """Main loop: pop requests, dispatch responses, enqueue follow-ups and
        store parsed articles; flush stdout each iteration for live logging."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule", weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print("New Result", type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert("articles", result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)
            sys.stdout.flush()

    def request(self, weixin_request):
        """Execute a WeixinRequest (redirects followed -- requests' default).

        :return: the Response, or False on connection error / timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        "http": "http://" + proxy,
                        "https": "https://" + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def parse_index(self, response):
        """Yield one detail request per article link and, when present, a
        request for the next results page."""
        doc = pq(response.text)
        items = doc(".news-list .txt-box h3 a").items()
        for item in items:
            url = item.attr("href")
            yield WeixinRequest(url=url, callback=self.parse_detail)
        page = doc("#sogou_next").attr("href")
        if page:
            url = self.base_url + str(page)
            yield WeixinRequest(url=url, callback=self.parse_index, need_proxy=True)

    def parse_detail(self, response):
        """Yield one dict of article fields; the publish date is pulled from
        the page's inline JavaScript (publish_time variable)."""
        doc = pq(response.text)
        yield {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': re.search('publish_time = "(.*?)"', response.text).group(1),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }

    def error(self, weixin_request):
        """Count a failure and re-queue the request until MAX_FAILED_TIME."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Spider():
    """Sogou-Weixin crawler whose base_url embeds the query-string prefix;
    Redis-scheduled, proxy-pool routed, results stored in MySQL."""

    # NOTE: base_url already ends with '...&query=' -- it is only valid for
    # building the *initial* search URL by appending the keyword.
    base_url = 'http://weixin.sogou.com/weixin?type=2&s_from=input&query='
    keyword = 'NBA'
    # Headers copied from a logged-in browser session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'CXID=D4E8EAA2564D90FBC285BAECF3078601; SUID=AEF72BAB5E68860A5B25128E00099457; SUV=00A00B5AAB2BF7AE5B2512C0E549F141; ad=Plllllllll2bJS3nlllllV7f1w9lllll$hsTrkllll9llllllZlll5@@@@@@@@@@; IPLOC=CN4403; ABTEST=0|1532415249|v1; SNUID=ABF409F180850E3945E004EF802B5D16; weixinIndexVisited=1; JSESSIONID=aaav6mr8OopuEawYdpHsw; ppinf=5|1532422896|1533632496|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTozOk5HVXxjcnQ6MTA6MTUzMjQyMjg5NnxyZWZuaWNrOjM6TkdVfHVzZXJpZDo0NDpvOXQybHVHN1RDWjZBU0E0TjgzM2JvaGs5aGJFQHdlaXhpbi5zb2h1LmNvbXw; pprdig=UAPBSEKKv9ua27yywPBP0BKxd4FAEtELVT8yK7dxy7N57B3yS-PA3M-C3d-VEOBxc-N-IIRP7khJM3Amnnol_WBt5RTD-V0pgVuxqNVf0EqfwLJwDWkiI3OA0-rCrBJrdOnCK0vj3IZvheDE1yjLjv-mdw0tv5MSeqlFOWZyhPk; sgid=22-36203267-AVtW6vAowj2ViaOuOVGM5Mto; ppmdig=153242289600000060a71b68c3ad711211acdaf46fde1281',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one random proxy from the pool.

        :return: 'host:port' string, or None on failure
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Install headers on the session and enqueue the first request."""
        # Global header update so every request carries the cookies.
        self.session.headers.update(self.headers)
        start_url = self.base_url + self.keyword
        weixin_request = WeixinRequest(url=start_url, callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a results page.

        :param response: 响应 (the fetched response)
        :return: generator of detail-page and next-page WeixinRequests
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # FIX: renamed from 'next' (shadowed the builtin), and the URL is now
        # built on the bare endpoint. The original did
        # `self.base_url + str(next)`, but base_url ends with '...&query=' and
        # the #sogou_next href begins with '?query=...', producing a malformed
        # URL like '...&query=?query=NBA&page=2' -- pagination never worked.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = 'http://weixin.sogou.com/weixin' + str(next_href)
            weixin_request = WeixinRequest(url=url, callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article detail page.

        :param response: 响应 (the fetched response)
        :return: generator yielding one article dict (date intentionally
                 omitted in this variant)
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Execute a WeixinRequest, via a pool proxy when need_proxy is set.

        :return: the Response, or False on connection error / timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request until MAX_FAILED_TIME."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Main loop: pop requests, dispatch responses to their callbacks,
        enqueue follow-up requests and store parsed articles."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Spider():
    """Sogou-Weixin crawler (local proxy pool on :8080, results to Mysql).

    Pagination is disabled in this variant: parse_index only follows article
    links on the current results page.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Headers copied from a logged-in browser session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'SUID=329D9B273865860A598BD206000E05BC; ABTEST=0|1536152449|v1; IPLOC=CN1100; weixinIndexVisited=1; SUV=00334F7972F4B9F35B8FD385513C8523; SNUID=9111B04D3832425DCA568C3938B9B969; sct=2; JSESSIONID=aaaJrY-Zle4RBb7vUTBvw',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = Mysql()

    def get_proxy(self):
        """Fetch one random proxy from the local pool.

        :return: 'host:port' string, or None on failure
        """
        proxy_url = 'http://127.0.0.1:8080/random'
        try:
            response = requests.get(proxy_url)
            if response.status_code == 200:
                print("GET Proxy:", response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Install headers on the session and enqueue the first request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'type': '2', 'query': self.keyword})
        weixin_request = WeixinRequest(start_url, self.parse_index, need_proxy=True)
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a results page and yield one detail request per article link.

        :param response: the fetched response
        :return: generator of WeixinRequests
        """
        doc = pq(response.text)
        # FIX: the selector was '.news-box .news-list li txt-box h3 a' --
        # missing the dot on '.txt-box', so it matched a (nonexistent)
        # <txt-box> element and never yielded any article link.
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # Next-page following was commented out in the original and stays
        # disabled: only the first results page is crawled.

    def parse_detail(self, response):
        """Parse an article detail page (content extraction disabled in this
        variant) and yield the fields as a dict."""
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'date': doc('.rich_media_meta_list #publish_time').text(),
            'nickname': doc('.rich_media_meta_list .rich_media_meta_text').text(),
            'wechat': doc('.rich_media_meta_list .rich_media_meta_nickname').text()
        }
        print(data)
        yield data

    def request(self, weixin_request):
        """Execute a WeixinRequest with TLS verification off.

        NOTE(review): the original's try/except was commented out, so
        connection errors propagate to the caller here; the 'https' proxy
        entry deliberately(?) points at an http:// proxy URL -- confirm.

        :return: the Response
        """
        if weixin_request.need_proxy:
            proxy = self.get_proxy()
            if proxy:
                proxies = {
                    'http': 'http://' + proxy,
                    'https': 'http://' + proxy
                }
                return self.session.send(weixin_request.prepare(),
                                         timeout=weixin_request.timeout,
                                         allow_redirects=False,
                                         proxies=proxies, verify=False)
        return self.session.send(weixin_request.prepare(),
                                 timeout=weixin_request.timeout,
                                 allow_redirects=False, verify=False)

    def error(self, weixin_request):
        """Count a failure and re-queue the request until MAX_FAILED_TIME."""
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times', weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Main loop: pop requests, dispatch responses to their callbacks,
        enqueue follow-up requests and store parsed articles."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule", weixin_request.url)
            response = self.request(weixin_request)
            print(response)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
                    print(response)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Spider:
    """Crawler for the tiam.jp post ranking: walks the ranking pages, follows
    each article link (through the proxy pool) and stores the parsed records
    in the 'tiam' MySQL table."""

    base_url = "https://tiam.jp/post_ranking"
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()
    # Discard any requests left over from a previous run.
    queue.clear()

    def get_proxy(self):
        """Return one proxy address from the pool, or None when unavailable."""
        try:
            resp = requests.get(PROXY_POOL_URL)
        except requests.ConnectionError:
            return None
        if resp.status_code != 200:
            return None
        print("Get Proxy", resp.text)
        return resp.text

    def start(self):
        """Seed the queue with the ranking front page (no proxy needed)."""
        start_url = self.base_url
        print(start_url)
        seed = WeixinRequest(url=start_url,
                             callback=self.parse_index,
                             need_proxy=False)
        self.queue.add(seed)

    def parse_index(self, response):
        """Yield one detail-page request per listed article, plus a request
        for the next ranking page when a pager link exists (both via proxy)."""
        page = pq(response.text)
        for article in page('.row.hsd-article').items():
            yield WeixinRequest(url=article('a').attr('href'),
                                callback=self.parse_detail,
                                need_proxy=True)
        pager_href = page('.pager_right a').attr('href')
        if pager_href:
            yield WeixinRequest(url=pager_href,
                                callback=self.parse_index,
                                need_proxy=True)

    def parse_detail(self, response):
        """Yield a single dict describing the article on a detail page."""
        page = pq(response.text)
        yield {
            'actor': page('div.thin_margin').text().split(' ')[0],
            'title': page('.hsd-article-title').text().strip(),
            'video_url': page('.hsd-video').attr('src'),
            'click_times': page('.date.low').text().split(' ')[0],
            'release_date': page('.date.low').text().split(' ')[1],
            'category': page('div.thin_margin').text(),
            'image_url': page('.hsd-product-detail-img').attr('src'),
            'info': page('.hsd-lead-info').text(),
        }

    def request(self, weixin_request):
        """Send one request, routed through a pool proxy when required.

        :return: the Response, or False when the connection fails or times out
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    return self.session.send(
                        weixin_request.prepare(),
                        timeout=weixin_request.timeout,
                        allow_redirects=False,
                        proxies={"http": "http://" + proxy,
                                 "https": "https://" + proxy},
                    )
            return self.session.send(
                weixin_request.prepare(),
                timeout=weixin_request.timeout,
                allow_redirects=False,
            )
        except (ConnectionError, ReadTimeout) as exc:
            print(exc.args)
            return False

    def error(self, weixin_request):
        """Record one failure and re-queue the request while under the cap."""
        weixin_request.fail_time = weixin_request.fail_time + 1
        print("Request Failed", weixin_request.fail_time, "Times",
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Main loop: pop each request, dispatch its response to the bound
        callback, re-queue follow-up requests and insert parsed rows."""
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print("Schedule", weixin_request.url)
            response = self.request(weixin_request)
            if not (response and response.status_code in VALID_STATUSES):
                self.error(weixin_request)
                continue
            results = list(callback(response))
            if not results:
                self.error(weixin_request)
                continue
            for result in results:
                print("New Result", type(result))
                if isinstance(result, WeixinRequest):
                    self.queue.add(result)
                if isinstance(result, dict):
                    self.mysql.insert("tiam", result)

    def run(self):
        """Entry point: seed the queue, then run the scheduler."""
        self.start()
        self.schedule()
class Spider():
    """Crawl Sogou WeChat search results for `keyword` and store articles in MySQL.

    Every request shares one Session carrying a logged-in Cookie header
    (required to page past the anonymous result limit); requests are queued
    in Redis and retried up to MAX_FAILED_TIME on failure.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Copied from the browser's developer tools; the Cookie field is what
    # authenticates the session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN2102; SUID=9C73756F1F13940A000000005B1F2906; SUV=00F0227D6F75739C5B1F290662C32918; ABTEST=2|1529889370|v1; SNUID=345679DA0F0A7F62492989F810AAD276; weixinIndexVisited=1; ld=Jyllllllll2bG93illlllV7Zyr6lllllhXBpJlllll9lllllpylll5@@@@@@@@@@; LSTMV=0%2C0; LCLKINT=227; ppinf=5|1529891807|1531101407|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo5OiVFNyVBRCVCMXxjcnQ6MTA6MTUyOTg5MTgwN3xyZWZuaWNrOjk6JUU3JUFEJUIxfHVzZXJpZDo0NDpvOXQybHVHN2YxNzhjV2RRTS1FeFhqd0pTQnF3QHdlaXhpbi5zb2h1LmNvbXw; pprdig=mFU0I21BkcfTms9E36aUuv3z00DUZRBsL1tvbKyCkCzCpVhieM2YG-Q-zCFrX0Yp0hRzOMcLitg5lbi5rl4TfZb-voi6cug7on20l2thl5uYMoEwH_5NS9dEKUb4gYsW53BhZMc8g1Ceo2qR1mLpHzppFvMPZcnYsiqjTN6n4Ok; sgid=07-33580423-AVswS994vPgzNvLqqhTK3dQ; SUIR=CFAD9D26ECE98481C86D510FEC971D63; sct=4; ppmdig=1530100773000000c8ea0ae361dd1b6b84de1fec39f294b0;ad=nk91vlllll2b3z6OlllllV7ZyrolllllhXBpIkllll9lllllpylll5@@@@@@@@@@,CXID=26EC71D83938ABA9225220155EDA075A,ad=nk91vlllll2b3z6OlllllV7ZyrolllllhXBpIkllll9lllllpylll5@@@@@@@@@@, GOTO=Af99047,usid=ESlNvFQz9-gunhZe',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one proxy (ip:port string) from the proxy pool.

        :return: proxy string, or None when the pool is unreachable or empty
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and queue the first search request."""
        # Apply the headers (incl. Cookie) globally so every request uses them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: index-page response
        :return: generator of WeixinRequest objects (article pages, next page)
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # "next" renamed to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article page into a flat dict.

        :param response: article-page response
        :return: generator yielding one WeChat-article dict
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send a prepared request, optionally through a pooled proxy.

        :param weixin_request: request to execute
        :return: Response on success, False on connection error/timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request while retries remain.

        :param weixin_request: the failed request
        """
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue, dispatching each response to its callback.

        Callbacks yield either new WeixinRequest objects (re-queued) or
        dicts (inserted into the `articles` table).
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then drain it."""
        self.start()
        self.schedule()
class Spider():
    """Crawl Sogou WeChat search results for `keyword` and store articles in MySQL.

    Every request shares one Session carrying a logged-in Cookie header;
    requests are queued in Redis and retried up to MAX_FAILED_TIME on failure.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Copied from the browser's developer tools; the Cookie field is what
    # authenticates the session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN1100; SUID=6FEDCF3C541C940A000000005968CF55; SUV=1500041046435211; ABTEST=0|1500041048|v1; SNUID=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; weixinIndexVisited=1; JSESSIONID=aaar_m7LEIW-jg_gikPZv; ld=Wkllllllll2BzGMVlllllVOo8cUlllll5G@HbZllll9lllllRklll5@@@@@@@@@@; LSTMV=212%2C350; LCLKINT=4650; ppinf=5|1500042908|1501252508|dHJ1c3Q6MToxfGNsaWVudGlkOjQ6MjAxN3x1bmlxbmFtZTo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8Y3J0OjEwOjE1MDAwNDI5MDh8cmVmbmljazo1NDolRTUlQjQlOTQlRTUlQkElODYlRTYlODklOEQlRTQlQjglQTglRTklOUQlOTklRTglQTclODV8dXNlcmlkOjQ0Om85dDJsdUJfZWVYOGRqSjRKN0xhNlBta0RJODRAd2VpeGluLnNvaHUuY29tfA; pprdig=ppyIobo4mP_ZElYXXmRTeo2q9iFgeoQ87PshihQfB2nvgsCz4FdOf-kirUuntLHKTQbgRuXdwQWT6qW-CY_ax5VDgDEdeZR7I2eIDprve43ou5ZvR0tDBlqrPNJvC0yGhQ2dZI3RqOQ3y1VialHsFnmTiHTv7TWxjliTSZJI_Bc; sgid=27-27790591-AVlo1pzPiad6EVQdGDbmwnvM; PHPSESSID=mkp3erf0uqe9ugjg8os7v1e957; SUIR=CEA85AE02A2F7E6EAFF9C1FE2ABEBE6F; sct=11; ppmdig=1500046378000000b7527c423df68abb627d67a0666fdcee; successCount=1|Fri, 14 Jul 2017 15:38:07 GMT',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one proxy (ip:port string) from the proxy pool.

        :return: proxy string, or None when the pool is unreachable or empty
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print("Get Proxy", response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and queue the first search request."""
        # Apply the headers (incl. Cookie) globally so every request uses them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: index-page response
        :return: generator of WeixinRequest objects (article pages, next page)
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # "next" renamed to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article page into a flat dict.

        :param response: article-page response
        :return: generator yielding one WeChat-article dict
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            # BUGFIX: js_profile_qrcode is an element id, not a class — the
            # old '.js_profile_qrcode' selector never matched, leaving the
            # nickname empty (the 'wechat' selector below already uses '#').
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send a prepared request, optionally through a pooled proxy.

        :param weixin_request: request to execute
        :return: Response on success, False on connection error/timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request while retries remain.

        :param weixin_request: the failed request
        """
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue, dispatching each response to its callback.

        Callbacks yield either new WeixinRequest objects (re-queued) or
        dicts (inserted into the `articles` table).
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then drain it."""
        self.start()
        self.schedule()
class Spider:
    """Crawl Sogou WeChat search results for `keyword` and store articles in MySQL.

    Every request shares one Session carrying a logged-in Cookie header;
    requests are queued in Redis and retried up to MAX_FAILED_TIME on failure.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Copied from the browser's developer tools; the Cookie field is what
    # authenticates the session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': 'SUID=089C27B72423910A00000000587EF70E; SUV=00BA7A577157A2CD587EF716A8C08112; SUID=089C27B73020910A00000000587EF70E; CXID=2A1D8FB008EF5B378769F33E6517BC05; ABTEST=1|1543566907|v1; IPLOC=CN4403; JSESSIONID=aaaMubXXxB7JFgNDnA6Cw; weixinIndexVisited=1; sct=1; SNUID=E4EC78F24540395C82F3ABDD4616E1A3',
        # BUGFIX: all requests target weixin.sogou.com; the previous value
        # 'mp.weixin.qq.com' sent a mismatched Host header to Sogou.
        'Host': 'weixin.sogou.com',
        'Pragma': 'no-cache',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one proxy (ip:port string) from the proxy pool.

        :return: proxy string, or None when the pool is unreachable or empty
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and queue the first search request."""
        # Apply the headers (incl. Cookie) globally so every request uses them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({
            'query': self.keyword,
            'type': 2
        })
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=True)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: index-page response
        :return: generator of WeixinRequest objects (article pages, next page)
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            print(url)
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # "next" renamed to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            print(url)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=True)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article page into a flat dict.

        :param response: article-page response
        :return: generator yielding one WeChat-article dict
        """
        doc = pq(response.text)
        print('article detail', doc)
        data = {
            'title': doc('h2.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_name').text(),
            'wechat': doc('#js_name > div > p:nth-child(3) > span').text()
        }
        yield data

    def request(self, weixin_request):
        """Send a prepared request, optionally through a pooled proxy.

        :param weixin_request: request to execute
        :return: Response on success, False on connection error/timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def error(self, weixin_request):
        """Count a failure and re-queue the request while retries remain.

        :param weixin_request: the failed request
        """
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            # BUGFIX: the re-queue had been commented out (leftover from
            # debugging), which made the MAX_FAILED_TIME guard dead code and
            # silently dropped every failed request. Restored to match the
            # retry behaviour implemented by the sibling spiders in this file.
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue, dispatching each response to its callback.

        Callbacks yield either new WeixinRequest objects (re-queued) or
        dicts (inserted into the `articles` table).
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            print('Callback', callback)
            response = self.request(weixin_request)
            print('list', response)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        print('save data', result)
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def run(self):
        """Entry point: seed the queue, then drain it."""
        self.start()
        self.schedule()
class spider(object):
    """Crawl Sogou WeChat search results for `keywords` and store articles in MySQL.

    Index pages are parsed with lxml XPath, detail pages with pyquery; the
    publish date is pulled from inline JS via `ruler`. Requests are queued in
    Redis and retried up to Max_failed_time on failure.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    # NOTE(review): these header names contain literal spaces ('Accept -
    # Encoding', ...) and a mismatched Host — copied verbatim to preserve
    # behaviour, but they are almost certainly not what the server expects.
    headers = {
        'Accept': 'image / webp, image / apng, image / *, * / *;q = 0.8',
        'Accept - Encoding': 'gzip, deflate,UTF-8',
        'Accept - Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
        'Connection': 'keep - alive',
        'Host': 'pb.sogou.com',
        'Referer': 'http: // wx.sogou.com /',
        'User - Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 YaBrowser/18.6.1.770 Yowser/2.5 Safari/537.36'
    }
    # Extracts the publish date from the article page's inline JavaScript.
    ruler = re.compile(r'var publish_time = "(\d+.\d+.\d+)')
    keywords = '保加利亚妖王'
    session = requests.Session()
    queue = RedisQueue()
    mysql1 = mysql()

    def get_proxy(self):
        """Fetch one proxy (ip:port string) from the proxy pool web API.

        :return: proxy string, or None when the pool is unreachable or empty
        """
        try:
            response = requests.get(proxy_pool_web)
            if response.status_code == 200:
                print(response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and queue the first search request."""
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + urlencode({'query': self.keywords,
                                                     'type': 2})
        # First request, wrapped in our weixinrequest type.
        weixin_request = weixinrequest(callback=self.parse_index,
                                       url=start_url,
                                       need_proxy=False)
        # Enqueue it for the scheduler.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results (index) page with XPath.

        :param response: index-page response
        :return: generator of weixinrequest objects (detail pages, next page)
        """
        response.encoding = 'utf-8'
        doc = etree.HTML(response.text)
        url_details = doc.xpath('//div[@class="txt-box"]/h3/a/@href')
        print('开始解析')
        print(url_details)
        for i in url_details:
            wei = weixinrequest(url=i, callback=self.parse_detail,
                                need_proxy=True)
            print(wei.url)
            yield wei
        next_page = doc.xpath('//a[@id="sogou_next"]/@href')
        if next_page:
            # Throttle before requesting the next index page.
            time.sleep(2)
            url_nex = self.base_url + str(next_page[0])
            wei_nex = weixinrequest(url=url_nex, callback=self.parse_index,
                                    need_proxy=False)
            yield wei_nex

    def parse_detail(self, reponse):
        """Parse an article page into a flat dict.

        :param reponse: article-page response
        :return: generator yielding one article dict
        """
        time.sleep(1)
        reponse.encoding = 'utf-8'
        doc = pq(reponse.text)
        print("细节解析")
        data = {
            'titles': doc('#activity-name').text(),
            'contens': doc('#js_content').text(),
            'date': self.ruler.findall(reponse.text)[0],
            'wechat': doc('#js_name').text()
        }
        print(data)
        yield data

    def requesttt(self, wx):
        """Prepare and send a request, optionally through a pooled proxy.

        wx.prepare() turns the wrapped request into a PreparedRequest,
        which the shared session then sends.

        :param wx: weixinrequest to execute
        :return: Response on success, False on connection errors
        """
        try:
            if wx.need_proxy:
                proxy = self.get_proxy()
                # BUGFIX: the pool can return None; the old code concatenated
                # 'http://' + None and raised TypeError. Only build the proxies
                # mapping (and route through it) when a proxy was obtained.
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        'https': 'https://' + proxy
                    }
                    return self.session.send(wx.prepare(), timeout=wx.timeout,
                                             allow_redirects=True,
                                             proxies=proxies)
            return self.session.send(wx.prepare(), timeout=wx.timeout,
                                     allow_redirects=True)
        except (requests.ConnectionError, requests.ConnectTimeout) as e:
            print(e.args)
            # BUGFIX: return an explicit False (the scheduler tests
            # truthiness) instead of falling through and returning None.
            return False

    def error(self, wx):
        """Count a failure and re-queue the request while retries remain.

        :param wx: the failed weixinrequest
        """
        wx.fail_time = wx.fail_time + 1
        print('request failed %s times %s ' % (wx.fail_time, wx.url))
        # NOTE(review): '<=' allows one more retry than the sibling spiders'
        # '<' comparison — kept as-is to preserve behaviour; confirm intent.
        if wx.fail_time <= Max_failed_time:
            self.queue.add(wx)

    def scheduler(self):
        """Drain the queue, dispatching each response to its callback.

        Callbacks yield either new weixinrequest objects (re-queued) or
        dicts (inserted into the `articles` table).
        """
        while not self.queue.empty():
            wx = self.queue.pop()
            callback = wx.callback
            print('schedule %s' % wx.url)
            rseponse = self.requesttt(wx)
            if rseponse and rseponse.status_code in Valid_statues:
                print('zhix')
                results = list(callback(rseponse))
                if results:
                    for result in results:
                        if isinstance(result, weixinrequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql1.insert('articles', result)
                else:
                    self.error(wx)
            else:
                self.error(wx)

    def run(self):
        """Entry point: seed the queue, then drain it."""
        self.start()
        self.scheduler()
class Spider():
    """Crawl Sogou WeChat search results for `keyword` and store articles in MySQL.

    Every request shares one Session carrying a logged-in Cookie header;
    requests are queued in Redis and retried up to MAX_FAILED_TIME on failure.
    """

    base_url = 'http://weixin.sogou.com/weixin'
    keyword = 'NBA'
    # Copied from the browser's developer tools; the Cookie field is what
    # authenticates the session.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'IPLOC=CN3100; SUV=006058CA7258092E5A365AE5C31C4119; usid=7owwSvanBFp34s-d; ABTEST=8|1535070343|v1; SUID=4AFC5A653F18960A000000005B7F5087; SUID=4AFC5A652E18960A000000005B7F5087; weixinIndexVisited=1; SNUID=76C7665E3B39481850181AC33C52A601; sct=3; JSESSIONID=aaaDu_x19Td39bcXoOBvw',
        'Host': 'weixin.sogou.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    session = Session()
    queue = RedisQueue()
    mysql = MySQL()

    def get_proxy(self):
        """Fetch one proxy (ip:port string) from the proxy pool.

        :return: proxy string, or None when the pool is unreachable, empty,
                 or responds with a non-200 status
        """
        try:
            response = requests.get(PROXY_POOL_URL)
            if response.status_code == 200:
                print('Get Proxy', response.text)
                return response.text
            return None
        except requests.ConnectionError:
            return None

    def start(self):
        """Initialise the session headers and queue the first search request."""
        # Apply the headers (incl. Cookie) globally so every request uses them.
        self.session.headers.update(self.headers)
        start_url = self.base_url + '?' + \
            urlencode({'query': self.keyword, 'type': 2})
        weixin_request = WeixinRequest(url=start_url,
                                       callback=self.parse_index,
                                       need_proxy=False)
        # Schedule the first request.
        self.queue.add(weixin_request)

    def parse_index(self, response):
        """Parse a search-results (index) page.

        :param response: index-page response
        :return: generator of WeixinRequest objects (article pages, next page)
        """
        doc = pq(response.text)
        items = doc('.news-box .news-list li .txt-box h3 a').items()
        for item in items:
            url = item.attr('href')
            weixin_request = WeixinRequest(url=url, callback=self.parse_detail)
            yield weixin_request
        # "next" renamed to avoid shadowing the builtin.
        next_href = doc('#sogou_next').attr('href')
        if next_href:
            url = self.base_url + str(next_href)
            weixin_request = WeixinRequest(url=url,
                                           callback=self.parse_index,
                                           need_proxy=False)
            yield weixin_request

    def parse_detail(self, response):
        """Parse an article page into a flat dict.

        :param response: article-page response
        :return: generator yielding one WeChat-article dict
        """
        doc = pq(response.text)
        data = {
            'title': doc('.rich_media_title').text(),
            'content': doc('.rich_media_content').text(),
            'date': doc('#post-date').text(),
            'nickname': doc('#js_profile_qrcode > div > strong').text(),
            'wechat': doc('#js_profile_qrcode > div > p:nth-child(3) > span').text()
        }
        yield data

    def error(self, weixin_request):
        """Count a failure and re-queue the request while retries remain.

        :param weixin_request: the failed request
        """
        weixin_request.fail_time += 1
        print('Request Failed', weixin_request.fail_time, 'Times',
              weixin_request.url)
        if weixin_request.fail_time < MAX_FAILED_TIME:
            self.queue.add(weixin_request)

    def schedule(self):
        """Drain the queue, dispatching each response to its callback.

        Callbacks yield either new WeixinRequest objects (re-queued) or
        dicts (inserted into the `articles` table).
        """
        while not self.queue.empty():
            weixin_request = self.queue.pop()
            callback = weixin_request.callback
            print('Schedule', weixin_request.url)
            response = self.request(weixin_request)
            print('Response', response)
            if response and response.status_code in VALID_STATUSES:
                results = list(callback(response))
                if results:
                    for result in results:
                        print('New Result', type(result))
                        if isinstance(result, WeixinRequest):
                            self.queue.add(result)
                        if isinstance(result, dict):
                            self.mysql.insert('articles', result)
                else:
                    self.error(weixin_request)
            else:
                self.error(weixin_request)

    def request(self, weixin_request):
        """Send a prepared request, optionally through a pooled proxy.

        :param weixin_request: request to execute
        :return: Response on success, False on connection error/timeout
        """
        try:
            if weixin_request.need_proxy:
                proxy = self.get_proxy()
                if proxy:
                    proxies = {
                        'http': 'http://' + proxy,
                        # BUGFIX: the key was 'https:' (trailing colon), so
                        # requests never matched it and HTTPS traffic bypassed
                        # the proxy entirely.
                        'https': 'https://' + proxy
                    }
                    return self.session.send(weixin_request.prepare(),
                                             timeout=weixin_request.timeout,
                                             allow_redirects=False,
                                             proxies=proxies)
            return self.session.send(weixin_request.prepare(),
                                     timeout=weixin_request.timeout,
                                     allow_redirects=False)
        except (ConnectionError, ReadTimeout) as e:
            print(e.args)
            return False

    def run(self):
        """Entry point: seed the queue, then drain it."""
        self.start()
        self.schedule()