def process_exception(self, request, exception, spider):
    """Report the proxy of a failed request back to the pool.

    Scrapy downloader-middleware hook, invoked when downloading
    *request* raised *exception*.  Best-effort: any error while
    reporting (e.g. no 'proxy' key in request.meta) is logged and
    suppressed so exception processing continues.  Returns None.
    """
    try:
        # Pool keys are host:port, so strip the scheme prefix first.
        proxy_pool.add_failed_time(
            request.meta['proxy'].replace('http://', ''))
    except Exception as e:
        # Deliberate swallow; the original `pass` after log.error was dead.
        log.error(e)
def parse_content(self, response):
    """Parse a merchant detail page into a MerchantItem (generator).

    Extracts the merchant name, company profile, service area and
    picture via XPath and yields one item.  On any extraction failure
    the error is logged and, when proxying is enabled, the proxy that
    served the page is reported as failed instead of raising.
    """
    log.info('Begin parseContent ' + response.url)
    item = MerchantItem()
    item['updated_at'] = int(time.time())
    item['url'] = response.url
    # URL layout assumed: scheme://host/<area>/.../<merchant_id>/
    # — TODO confirm against the site's link structure.
    item['area'] = response.url.split('/')[4]
    item['merchant_id'] = response.url.split('/')[-2]
    try:
        item['merchant_name'] = response.xpath(
            '//span[@id="shop_name_val"]/text()')[0].extract()
        item['company_profile'] = response.xpath(
            '//div[@class="i-txt"]/span[@class="s-con"]/text()'
        )[0].extract()
        item['service_area'] = response.xpath(
            '//div[@class="des"]/div[@class="item-des clearfix"]'
            '/div[@class="i-txt i-dTxt"]/text()')[0].extract()
        item['merchant_pic'] = response.urljoin(
            response.xpath('//div[@class="pic"]/img/@src')[0].extract())
        yield item
    except Exception as e:
        log.warn("spider error %s ( refer: %s )" % (e, response.url))
        log.error(e)
        if configs.USE_PROXY:
            # Use .get: a missing 'proxy' key here would otherwise raise
            # an uncaught KeyError that masks the original parse error.
            proxy = response.meta.get('proxy')
            if proxy:
                proxy_pool.add_failed_time(proxy.replace('http://', ''))
def handle_item(self, item: CityItem):
    """Persist a CityItem, inserting a row only when its URL is not stored yet.

    DB errors are rolled back, logged and re-raised; the session is always
    closed.  Returns the item unchanged for the next pipeline stage.
    """
    log.info('process item from worm url = ' + item['url'])
    if isinstance(item, CityItem):
        session = self.session()
        record = CityDO()
        record.name = item['name']
        record.url = item['url']
        record.create_at = item['create_at']
        try:
            existing = (session.query(CityDO)
                        .filter(CityDO.url == record.url)
                        .first())
            if existing is None:
                # Unknown URL: insert a new row.
                log.info('add model from worm url = ' + record.url)
                session.add(record)
                session.flush()
                session.commit()
                log.info('spider_success url = ' + record.url)
        except Exception as error:
            session.rollback()
            log.error(error)
            raise
        finally:
            session.close()
    return item
def get_client(host: str, port: int) -> "MongoClient | None":
    """Create a MongoDB client for ``host:port``.

    Returns the MongoClient on success.  On ConnectionFailure the error
    is logged and ``None`` is returned — the original annotation claimed
    ``MongoClient`` while silently falling through to ``None``; callers
    must handle the ``None`` case.
    """
    try:
        client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE)
        log.info("Connected successfully!!!")
        return client
    except errors.ConnectionFailure as e:
        log.error(e)
        return None
def process_request(self, request, spider):
    """Attach a random proxy to non-splash requests (downloader hook).

    Splash requests are left untouched; proxy-pool errors are logged
    and suppressed so the request still goes out.
    """
    try:
        if 'splash' in request.meta:
            # Splash requests get their proxy set elsewhere.
            return
        chosen = proxy_pool.random_choice_proxy(False)
        request.meta['proxy'] = "http://%s" % chosen
    except Exception as e:
        log.error(e)
def process_request(self, request, spider):
    """Pick one random proxy, log it, and attach it to the request.

    Downloader-middleware hook.  Proxy-pool errors are logged and
    suppressed so the request proceeds without a proxy.
    """
    try:
        # Choose exactly once: the original called random_choice_proxy()
        # twice, so the proxy it logged was not the proxy it used.
        proxy = proxy_pool.random_choice_proxy()
        log.info("==== proxy = " + proxy + " ====")
        request.meta['proxy'] = "http://%s" % proxy
    except Exception as e:
        log.error(e)
def process_request(self, request, spider):
    """Attach a random proxy to the splash args of a splash request.

    Downloader-middleware hook.  Non-splash requests are left untouched;
    errors are logged and suppressed.
    """
    try:
        if 'splash' not in request.meta:
            return
        proxy = "http://%s" % proxy_pool.random_choice_proxy(False)
        request.meta['splash']['args']['proxy'] = proxy
        # log.info for consistency with the sibling proxy middlewares
        # (the original used a bare print here).
        log.info("using proxy:" + proxy)
    except Exception as e:
        log.error(e)
def find_all(self):
    """Return every CityDO row; on query failure log and return None.

    The session is always closed, success or failure.
    """
    session = self.session()
    try:
        return session.query(CityDO).all()
    except Exception as e:
        log.error(e)
    finally:
        session.close()
def db_connect_engine():
    """Build the SQLAlchemy engine from the DATABASES config.

    Creates the database and all mapped tables when missing; schema
    errors are logged but the engine is returned regardless.
    """
    utils.log('db_connect_engine')
    dsn = "%s://%s:%s@%s:%s/%s?charset=utf8mb4" % (
        DATABASES['DRIVER'], DATABASES['USER'], DATABASES['PASSWORD'],
        DATABASES['HOST'], DATABASES['PORT'], DATABASES['NAME'])
    engine = create_engine(dsn, echo=False)
    try:
        if not database_exists(engine.url):
            create_database(engine.url)  # create the database
        Base.metadata.create_all(engine)  # create the tables
    except Exception as e:
        log.error(e)
    return engine
def process_response(self, request, response, spider):
    """On a non-success response, mark the proxy that served it as failed.

    Downloader-middleware hook.  Any status outside [200, 400) counts as
    a failure.  A request with no proxy recorded is silently ignored;
    other reporting errors are logged.  Always returns the response.
    """
    print("CatchException:" + request.url + " " + str(response.status))
    if not (200 <= response.status < 400):
        try:
            if 'splash' in request.meta:
                used = request.meta['splash']['args']['proxy']
            else:
                used = request.meta['proxy']
            proxy_pool.add_failed_time(used.replace('http://', ''))
        except KeyError:
            # No proxy on this request; nothing to report.
            pass
        except Exception as e:
            log.error(e)
    return response
def handle_item(self, item: MerchantItem):
    """Persist a MerchantItem, inserting a row only when its URL is new.

    Existing rows are intentionally left untouched (the old update
    branch was disabled; the dead commented-out code has been removed).
    DB errors are rolled back, logged and re-raised; the session is
    always closed.  Returns the item for the next pipeline stage.
    """
    log.info('process item from worm url = ' + item['url'])
    if isinstance(item, MerchantItem):
        session = self.session()
        model = MerchantDO()
        model.updated_at = item['updated_at']
        model.merchant_name = item['merchant_name']
        model.company_profile = item['company_profile']
        model.service_area = item['service_area']
        model.merchant_pic = item['merchant_pic']
        model.merchant_id = item['merchant_id']
        model.url = item['url']
        model.area = item['area']
        try:
            m = session.query(MerchantDO).filter(
                MerchantDO.url == model.url).first()
            if m is None:
                # URL not stored yet: insert.
                log.info('add model from worm url = ' + model.url)
                session.add(model)
                session.flush()
                session.commit()
                log.info('spider_success url = ' + model.url)
        except Exception as error:
            session.rollback()
            log.error(error)
            raise
        finally:
            session.close()
    return item
def update_one(self, collection, condition: dict, value: dict):
    """Apply *value* as a ``$set`` update to the first document in
    *collection* matching *condition*.

    Returns the driver's result object, or None when the driver raises
    (the error is logged).
    """
    update_doc = {"$set": value}
    try:
        return collection.update_one(condition, update_doc)
    except Exception as e:
        log.error(e)
def find_one(self, collection, condition: dict):
    """Return the first document in *collection* matching *condition*.

    Returns None when the driver raises (the error is logged).
    """
    try:
        return collection.find_one(condition)
    except Exception as exc:
        log.error(exc)
def save_to_database(self, collection, item):
    """Insert *item*'s attribute dict into *collection* via the mongo service.

    Best-effort persistence: any error (including a missing ``__dict__``
    on *item*) is logged and swallowed.
    """
    try:
        mongodb_service.insert(collection, item.__dict__)
    except Exception as err:
        log.error(err)
def get_db(client: MongoClient, db_name: str) -> Database:
    """Return a Database handle named *db_name* on *client*.

    Errors constructing the handle are logged and None is returned.
    """
    try:
        return Database(client, db_name)
    except Exception as e:
        log.error(e)