from random import choice

from scrapy import signals

# IPProxyTool and Logging are project-local helpers; their imports are omitted here.


class DianpingSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    def __init__(self):
        self.ipTool = IPProxyTool()
        super(DianpingSpiderMiddleware, self).__init__()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.
        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.
        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.
        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.
        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def process_request(self, request, spider):
        # Attach a random proxy to each outgoing request; fall back to a
        # direct connection when the proxy pool is empty.
        ips = self.ipTool.getIPs()
        request.meta['proxy'] = 'http://' + choice(ips) if ips else None
        Logging.debug('{(request_url)%s : (proxy)%s}' %
                      (request.url, request.meta['proxy']))

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
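# For reference, a minimal, hypothetical settings.py sketch for wiring the
# class in (module paths are assumptions; adjust to the project layout).
# Note that process_request() is a downloader-middleware hook rather than a
# spider-middleware hook, so the class must also be registered under
# DOWNLOADER_MIDDLEWARES for the proxy assignment to actually run.
SPIDER_MIDDLEWARES = {
    'dianping.middlewares.DianpingSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'dianping.middlewares.DianpingSpiderMiddleware': 750,
}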
def request_ticket_info(self):
    Logging.debug('<---------- Requesting train ticket info ---------->')
    train_infos = []
    for url in self.request_ticket_urls:
        # Build one proxies dict per available IP so a random one can be picked.
        proxies = []
        for ip in IPProxyTool().getIPs():
            proxies.append(dict(http='http://' + ip, https='http://' + ip))
        Logging.debug(url)
        if proxies:
            response = self.session.get(url, verify=False, proxies=choice(proxies))
        else:
            response = self.session.get(url, verify=False)
        try:
            if response.status_code == 200:
                # Persist the session cookies so later requests stay authenticated.
                self.session.cookies.save(ignore_discard=True)
                info_json = response.json()
                result = info_json['data']['result']
                train_infos.append(result)
            else:
                Logging.warning(response.text)
                continue
        except Exception as e:
            Logging.debug(response.text)
            Logging.warning(e)
            continue
    return self.parse_result(train_infos)
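# A minimal sketch of the session setup request_ticket_info() assumes:
# cookies.save(ignore_discard=True) only exists on file-backed cookie jars,
# so the session presumably carries an LWPCookieJar. The filename here is
# an assumption.
import requests
from http.cookiejar import LWPCookieJar

session = requests.Session()
session.cookies = LWPCookieJar('ticket_cookies.txt')  # hypothetical path
try:
    session.cookies.load(ignore_discard=True)  # reuse cookies from a previous run
except OSError:
    pass  # first run: no cookie file yet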
def process_exception(self, request, exception, spider):
    if isinstance(exception, self.EXCEPTIONS_TO_RETRY) and not request.meta.get(
            'dont_retry', False):
        retry_times = request.meta.get('retry_times', 0)
        Logging.warning('<---------- {(url)%s (retry_time):%s} ---------->' %
                        (request.url, retry_times))
        if retry_times == 2:
            # Second retry: drop the proxy and attempt a direct connection.
            request.meta['proxy'] = None
        elif retry_times == 3:
            # Third retry: record the failing URL for later inspection.
            with open('fail_urls.txt', 'a+') as f:
                f.write(request.url + '\n')
        else:
            # Otherwise switch to a fresh random proxy when one is available,
            # and retry the URL stripped of its query string. (str.split()
            # always returns at least one element, so the original
            # "if len(...) > 0 else None" guard was dead code that could have
            # replaced the request with None.)
            ips = IPProxyTool().getIPs()
            request.meta['proxy'] = 'http://' + choice(ips) if ips else None
            request = request.replace(url=request.url.split('?')[0])
        return self._retry(request, exception, spider)
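# Assumed context for process_exception(): EXCEPTIONS_TO_RETRY and _retry()
# come from Scrapy's built-in RetryMiddleware, so the enclosing class
# presumably subclasses it and replaces the stock middleware. A hedged
# settings.py sketch (class name and module path are assumptions):
from scrapy.downloadermiddlewares.retry import RetryMiddleware

class ProxyRetryMiddleware(RetryMiddleware):  # hypothetical name
    pass  # process_exception() above would live here

DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,  # disable the stock retry
    'dianping.middlewares.ProxyRetryMiddleware': 550,
}
RETRY_TIMES = 4  # assumption: must exceed 3 for the retry_times == 3 branch to run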
def request_proxy(self, min_ip_count=5):
    # Use a single IPProxyTool instance: the original created two separate
    # instances, so destIP was set on an object that was never refreshed.
    tool = IPProxyTool()
    tool.destIP = INITURL
    tool.refresh(min_ip_count=min_ip_count)
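# IPProxyTool itself is not shown; this is a hypothetical sketch of the
# interface the snippets above rely on (destIP, refresh(), getIPs()),
# reconstructed from the call sites. The real implementation (proxy source,
# validation logic) is left out.
class IPProxyTool(object):
    def __init__(self):
        self.destIP = None  # target site used to validate candidate proxies
        self._ips = []

    def refresh(self, min_ip_count=5):
        """Re-fetch and validate proxies until at least min_ip_count survive."""
        raise NotImplementedError

    def getIPs(self):
        """Return the current list of validated 'host:port' strings."""
        return self._ips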
import threading

from apscheduler.schedulers.blocking import BlockingScheduler
from sqlalchemy import create_engine, MetaData
from sqlalchemy.orm import sessionmaker

# cp (a ConfigParser), IPProxyTool, DianPing, and create_table are
# project-local names; their imports are omitted here.


class DianpingPipeline(object):
    def __init__(self):
        # Connect to the database.
        try:
            engine = create_engine(
                'mysql://%s:%s@%s:%s/%s?charset=utf8' %
                (cp.get('db', 'user'), cp.get('db', 'pwd'),
                 cp.get('db', 'host'), cp.get('db', 'port'),
                 cp.get('db', 'dbname')),
                echo=False)
            DBSession = sessionmaker(bind=engine)
            self.metadata = MetaData(engine)
            self.db_session = DBSession()
        except Exception:
            raise
        # Create the tables if they do not exist yet.
        create_table(engine)
        self.ipTool = IPProxyTool()
        self.ipTool.destIP = 'https://www.dianping.com'
        self.ipTool.refresh()
        # Refresh the proxy pool periodically on a daemon thread.
        self.thread = threading.Thread(target=self.schedule)
        self.thread.daemon = True
        self.thread.start()

    def process_item(self, item, spider):
        cuisines = item['cuisine']
        names = item['name']
        shop_urls = item['shop_url']
        stars = item['star']
        comment_counts = item['comment_count']
        avg_prices = item['avg_price']
        districts = item['district']
        streets = item['street']
        update_times = item['update_time']
        for index in range(len(names)):
            model = DianPing(cuisine=cuisines[index],
                             shop_url=shop_urls[index],
                             name=names[index],
                             star=stars[index],
                             comment_count=comment_counts[index],
                             avg_price=avg_prices[index],
                             district=districts[index],
                             street=streets[index],
                             update_time=update_times[index])
            self.db_session.add(model)
        # Commit once per item batch rather than once per row.
        self.db_session.commit()
        return item

    def close_spider(self, spider):
        self.db_session.close()

    def schedule(self):
        scheduler = BlockingScheduler()
        scheduler.add_job(self.ipTool.refresh, 'interval', seconds=300)
        scheduler.start()
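# A hedged sketch of the DianPing model and create_table() helper the
# pipeline assumes; the table name, column types, and lengths are guesses
# based on the fields stored above, not the project's actual schema.
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class DianPing(Base):
    __tablename__ = 'dianping'  # hypothetical table name
    id = Column(Integer, primary_key=True, autoincrement=True)
    cuisine = Column(String(64))
    shop_url = Column(String(255))
    name = Column(String(128))
    star = Column(String(32))
    comment_count = Column(String(32))
    avg_price = Column(String(32))
    district = Column(String(64))
    street = Column(String(128))
    update_time = Column(String(32))

def create_table(engine):
    # Create all mapped tables that do not exist yet.
    Base.metadata.create_all(engine)

# The pipeline would then be enabled in settings.py, e.g.:
# ITEM_PIPELINES = {'dianping.pipelines.DianpingPipeline': 300}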