Пример #1
0
 def process_exception(self, request, exception, spider):
     """Record a failure against the proxy that served ``request``.

     The ``http://`` prefix is stripped from ``request.meta['proxy']``
     before the address is passed to ``proxy_pool.add_failed_time``.
     Any error raised along the way (including a missing ``proxy`` key)
     is logged and swallowed, so the method always returns ``None``.
     """
     try:
         proxy_address = request.meta['proxy'].replace('http://', '')
         proxy_pool.add_failed_time(proxy_address)
     except Exception as exc:
         log.error(exc)
Пример #2
0
    def parse_content(self, response):
        """Parse a merchant page and yield one populated ``MerchantItem``.

        ``area`` and ``merchant_id`` are read from fixed positions of the
        ``/``-separated URL; the remaining fields are the first match of
        their XPath queries.

        If any field cannot be extracted, nothing is yielded and the error
        is logged. When ``configs.USE_PROXY`` is truthy and the response
        carries a ``proxy`` entry in its meta, a failure is also recorded
        against that proxy.
        """
        log.info(('Begin parseContent ' + response.url))

        # Split once; both URL-derived fields come from the same parts.
        url_parts = response.url.split('/')

        item = MerchantItem()

        item['updated_at'] = int(time.time())
        item['url'] = response.url
        item['area'] = url_parts[4]
        item['merchant_id'] = url_parts[-2]

        try:
            item['merchant_name'] = response.xpath(
                '//span[@id="shop_name_val"]/text()')[0].extract()
            item['company_profile'] = response.xpath(
                '//div[@class="i-txt"]/span[@class="s-con"]/text()'
            )[0].extract()

            item['service_area'] = response.xpath(
                '//div[@class="des"]/div[@class="item-des clearfix"]/div[@class="i-txt i-dTxt"]/text()'
            )[0].extract()

            item['merchant_pic'] = response.urljoin(
                response.xpath('//div[@class="pic"]/img/@src')[0].extract())
        except Exception as e:
            log.warn("spider error %s ( refer: %s )" % (e, response.url))
            log.error(e)
            if configs.USE_PROXY:
                # The response may have been fetched without a proxy entry
                # in meta; indexing it directly would raise a new KeyError
                # from inside this handler and hide the original error.
                proxy = response.meta.get('proxy')
                if proxy:
                    proxy_pool.add_failed_time(proxy.replace('http://', ''))
        else:
            # Only emit the item once every field was extracted.
            yield item
Пример #3
0
    def handle_item(self, item: CityItem):
        """Store a ``CityItem`` unless a row with the same URL already exists.

        Items of any other type are passed through untouched. A failure
        during the lookup or insert rolls the transaction back, is logged
        and is re-raised; the session is closed in every case.

        Returns the item that was passed in.
        """
        log.info('process item from worm url = ' + item['url'])

        if not isinstance(item, CityItem):
            return item

        db_session = self.session()

        city = CityDO()
        city.name = item['name']
        city.url = item['url']
        city.create_at = item['create_at']

        try:
            lookup = db_session.query(CityDO).filter(CityDO.url == city.url)
            existing = lookup.first()

            if existing is None:  # no row for this URL yet: insert it
                log.info('add model from worm url = ' + city.url)
                db_session.add(city)
                db_session.flush()
                db_session.commit()
                log.info('spider_success url = ' + city.url)
        except Exception as exc:
            db_session.rollback()
            log.error(exc)
            raise
        finally:
            db_session.close()

        return item
def get_client(host: str, port: int) -> MongoClient:
    """Build a ``MongoClient`` for ``host``:``port``.

    The connection pool is capped at ``MAX_POOL_SIZE``. If constructing
    the client raises ``errors.ConnectionFailure`` the error is logged and
    ``None`` is returned instead of a client.
    """
    try:
        mongo_client = MongoClient(host, port, maxPoolSize=MAX_POOL_SIZE)
    except errors.ConnectionFailure as exc:
        log.error(exc)
        return None
    else:
        log.info("Connected successfully!!!")
        return mongo_client
Пример #5
0
 def process_request(self, request, spider):
     """Attach a proxy from the pool to requests that do not use Splash.

     Requests whose ``meta`` contains a ``splash`` key are left untouched.
     Errors are logged and swallowed; the method always returns ``None``.
     """
     try:
         if 'splash' not in request.meta:
             chosen = proxy_pool.random_choice_proxy(False)
             request.meta['proxy'] = "http://%s" % chosen
     except Exception as exc:
         log.error(exc)
Пример #6
0
 def process_request(self, request, spider):
     """Route ``request`` through a proxy picked from the pool.

     The proxy is chosen exactly once, so the address written to the log
     is the same one assigned to ``request.meta['proxy']``. Errors are
     logged and swallowed; the method always returns ``None``.
     """
     try:
         # Pick once: calling random_choice_proxy() separately for the log
         # line and for the assignment can yield two different proxies,
         # which makes the log misleading.
         proxy = proxy_pool.random_choice_proxy()
         log.info("==== proxy = " + proxy + "  ====")
         request.meta['proxy'] = "http://%s" % proxy
     except Exception as e:
         log.error(e)
Пример #7
0
 def process_request(self, request, spider):
     """Attach a proxy from the pool to Splash requests only.

     Requests without a ``splash`` key in ``meta`` are left untouched.
     For Splash requests the proxy URL is stored under
     ``request.meta['splash']['args']['proxy']`` and printed. Errors are
     logged and swallowed; the method always returns ``None``.
     """
     try:
         if 'splash' in request.meta:
             # Choose the proxy before touching the Splash arguments.
             proxy_url = "http://%s" % proxy_pool.random_choice_proxy(False)
             splash_args = request.meta['splash']['args']
             splash_args['proxy'] = proxy_url
             print("using proxy:" + splash_args['proxy'])
     except Exception as exc:
         log.error(exc)
Пример #8
0
    def find_all(self):
        """Return every ``CityDO`` row, or ``None`` if the query fails.

        A query error is logged rather than raised. The session opened for
        the query is closed in every case.
        """
        db_session = self.session()

        try:
            return db_session.query(CityDO).all()
        except Exception as exc:
            log.error(exc)
            return None
        finally:
            db_session.close()
Пример #9
0
def db_connect_engine():
    """Create the SQLAlchemy engine described by ``DATABASES``.

    If the target database does not exist yet, it is created together with
    every table registered on ``Base.metadata``. Errors raised during that
    bootstrap step are logged and swallowed, so the engine is returned
    regardless.
    """
    utils.log('db_connect_engine')

    # Values are looked up in the order the URL template consumes them.
    setting_keys = ('DRIVER', 'USER', 'PASSWORD', 'HOST', 'PORT', 'NAME')
    url_values = tuple(DATABASES[key] for key in setting_keys)
    connection_url = "%s://%s:%s@%s:%s/%s?charset=utf8mb4" % url_values
    engine = create_engine(connection_url, echo=False)

    try:
        database_missing = not database_exists(engine.url)
        if database_missing:
            create_database(engine.url)  # create the database
            Base.metadata.create_all(engine)  # create the tables
    except Exception as exc:
        log.error(exc)

    return engine
Пример #10
0
 def process_response(self, request, response, spider):
     """Record a proxy failure for responses outside the 200-399 range.

     The request URL and response status are always printed. For a status
     below 200 or at/above 400, the proxy is read from the Splash arguments
     when the request has a ``splash`` meta key, otherwise from
     ``request.meta['proxy']``, and reported to
     ``proxy_pool.add_failed_time`` without its ``http://`` prefix.

     Returns the response unchanged in every case.
     """
     print("CatchException:" + request.url + " " + str(response.status))

     if 200 <= response.status < 400:
         return response

     try:
         if 'splash' in request.meta:
             failed_proxy = request.meta['splash']['args']['proxy']
         else:
             failed_proxy = request.meta['proxy']
         proxy_pool.add_failed_time(failed_proxy.replace('http://', ''))
     except KeyError:
         # A missing key is ignored silently rather than logged.
         pass
     except Exception as exc:
         log.error(exc)

     return response
Пример #11
0
    def handle_item(self, item: MerchantItem):
        """Store a ``MerchantItem`` unless a row with the same URL exists.

        Items of any other type are passed through untouched. Rows that
        already exist are left as they are (there is no update branch).
        A failure during the lookup or insert rolls the transaction back,
        is logged and is re-raised; the session is closed in every case.

        Returns the item that was passed in.
        """
        log.info('process item from worm url = ' + item['url'])

        if not isinstance(item, MerchantItem):
            return item

        db_session = self.session()

        merchant = MerchantDO()
        merchant.updated_at = item['updated_at']
        merchant.merchant_name = item['merchant_name']
        merchant.company_profile = item['company_profile']
        merchant.service_area = item['service_area']
        merchant.merchant_pic = item['merchant_pic']
        merchant.merchant_id = item['merchant_id']
        merchant.url = item['url']
        merchant.area = item['area']

        try:
            lookup = db_session.query(MerchantDO).filter(
                MerchantDO.url == merchant.url)
            existing = lookup.first()

            if existing is None:  # no row for this URL yet: insert it
                log.info('add model from worm url = ' + merchant.url)
                db_session.add(merchant)
                db_session.flush()
                db_session.commit()
                log.info('spider_success url = ' + merchant.url)
        except Exception as exc:
            db_session.rollback()
            log.error(exc)
            raise
        finally:
            db_session.close()

        return item
Пример #12
0
 def update_one(self, collection, condition: dict, value: dict):
     """Apply ``value`` as a ``$set`` update to one matching document.

     Returns whatever ``collection.update_one`` returns, or ``None`` when
     the call raises (the error is logged, not propagated).
     """
     update_doc = {"$set": value}
     try:
         outcome = collection.update_one(condition, update_doc)
     except Exception as exc:
         log.error(exc)
         return None
     return outcome
Пример #13
0
 def find_one(self, collection, condition: dict):
     """Look up a single document matching ``condition``.

     Returns whatever ``collection.find_one`` returns, or ``None`` when
     the call raises (the error is logged, not propagated).
     """
     try:
         document = collection.find_one(condition)
     except Exception as exc:
         log.error(exc)
         return None
     return document
Пример #14
0
 def save_to_database(self, collection, item):
     """Insert the attribute dictionary of ``item`` into ``collection``.

     The insert goes through ``mongodb_service.insert``. Errors are logged
     and swallowed; the method always returns ``None``.
     """
     try:
         document = item.__dict__
         mongodb_service.insert(collection, document)
     except Exception as exc:
         log.error(exc)
def get_db(client: MongoClient, db_name: str) -> Database:
    """Return a ``Database`` handle named ``db_name`` on ``client``.

    If constructing the handle raises, the error is logged and ``None`` is
    returned instead.
    """
    try:
        return Database(client, db_name)
    except Exception as exc:
        log.error(exc)
        return None