def _init_start_requests(self, start_evt):
    """
    Submit every configured start request and then signal that crawling began.

    Each entry of ``self.start_requests`` is passed to ``self.crawl``. Once
    all of them have been submitted the method sleeps for one second and
    sets ``start_evt``.

    :param start_evt: event-like object; its ``set()`` method is called once
        the start requests have been handed to ``self.crawl``.
    :return: None
    """
    logger.info("start crawling !")

    for start_request in self.start_requests:
        self.crawl(start_request)

    # NOTE(review): the one-second pause presumably gives the submitted
    # requests time to reach the scheduler before waiters are released —
    # confirm against the thread that waits on ``start_evt``.
    time.sleep(1)
    start_evt.set()
def process_item(self, item, spider):
    """
    Log an item handed to the pipeline.

    :param item: the item to report; it is rendered into the log message.
    :param spider: accepted for interface compatibility; not used here.
    :return: None
    """
    # TODO: move this into the scheduler and hand the item over to the
    # collector thread instead of only logging it.
    message = "Process_item:\n \t {item}".format(item=item)
    logger.info(message)
def fetch(self, request):
    """
    Download ``request`` and wrap the outcome in a ``Response``.

    A request whose ``method`` is ``'POST'`` is sent with ``session.post``;
    any other method is sent with ``session.get``. Both calls receive the
    same ``data``, ``headers``, ``proxies`` and ``timeout`` arguments.

    :param request: object exposing ``url``, ``method``, ``data``,
        ``headers`` and ``meta``. ``meta['timeout']`` is required;
        ``meta.get('proxy')`` is passed as ``proxies`` (``None`` if absent).
    :return: a ``Response`` built from the HTTP response's ``url``,
        ``status_code``, ``headers``, ``content`` and ``text``.
    :raises KeyError: if ``request.meta`` has no ``'timeout'`` entry.
    """
    # NOTE(review): ``session`` is not bound inside this method; it is
    # presumably a module-level HTTP session. The original also carried a
    # commented-out ``session = self._get_session(url)`` — confirm which
    # one is intended.
    url = request.url
    meta = request.meta

    # Pick the sender once so the shared keyword arguments are spelled out
    # in a single place.
    send = session.post if request.method == 'POST' else session.get
    raw = send(
        url,
        data=request.data,
        headers=request.headers,
        proxies=meta.get('proxy'),
        timeout=meta['timeout']
    )

    r = Response(raw.url, raw.status_code, raw.headers, raw.content, raw.text)
    logger.info("Downloaded ({status}) {request}".format(
        request=str(request), status=r.status))
    return r
def close_spider(self, start_evt, close_evt):
    """
    Stop the spider once the scheduler has run empty.

    Waits for ``start_evt`` to be set, then, while ``self.running`` is true,
    sleeps 0.1 s and checks ``len(self.scheduler)``. When the length is zero
    it sets ``close_evt`` and clears ``self.running``, which ends the loop.

    Original design note (translated): keep checking the crawl queue; the
    idea is that when it is empty a flag is put into the queue and that
    flag is used to shut the threads down.

    :param start_evt: event-like object that is waited on before polling.
    :param close_evt: event-like object that is set when the scheduler is
        found empty.
    :return: None
    """
    start_evt.wait()  # blocks until the event has been set

    while self.running:
        time.sleep(.1)
        if len(self.scheduler) != 0:
            # Work is still queued; poll again.
            continue
        close_evt.set()
        self.running = False

    logger.info("close spider !")
def get_blog_list(self, response):
    """
    Insert the ``mainodor`` entries of a JSON response into
    ``raw_item_smell_rank``.

    ``response.text`` is parsed as JSON. For every entry under the
    ``'mainodor'`` key one row is inserted, made of
    ``response.meta['item_id']``, the entry's ``'uoodor'`` and its ``'cnt'``.
    Each row is committed on its own. A failure while obtaining the
    connection, executing or committing is logged and the loop continues
    with the next entry.

    :param response: object exposing ``text`` (a JSON document) and ``meta``
        (a mapping containing ``'item_id'``).
    :return: None
    :raises KeyError: if ``'mainodor'``, ``'item_id'``, ``'uoodor'`` or
        ``'cnt'`` is missing (these lookups happen outside the ``try``).
    """
    payload = json.loads(response.text)

    # The values come from crawled data, so they are bound as query
    # parameters rather than interpolated into the SQL text. Interpolation
    # broke on values containing quotes and allowed SQL injection.
    # NOTE(review): assumes the driver behind ``self.get_conn()`` uses the
    # ``%s`` placeholder style (as the MySQL drivers do) — confirm.
    sql = "insert raw_item_smell_rank value(%s, %s, %s)"

    for odor in payload['mainodor']:
        params = (response.meta['item_id'], odor['uoodor'], odor['cnt'])
        try:
            conn = self.get_conn()
            cursor = conn.cursor()
            try:
                cursor.execute(sql, params)
            finally:
                # Release the cursor even when the statement fails.
                cursor.close()
            conn.commit()
            logger.info("insert OK!" + str(odor['uoodor']))
        except Exception as e:
            # Best effort: report the failed row and move on to the next.
            logger.error("Error: %s", str(e), exc_info=True)