def run_job(self):
    Logger().info('product_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductAddJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    # anti-scraping on the product pages is strict,
                    # so rotate through random-IP proxies here
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                crawler = ProductAddCrawler(job_entity, self.http)
                if crawler.productItem:
                    # re-queue the job for the product crawler once the item record exists
                    job_dict['product_item_id'] = crawler.productItem.id
                    new_job = ProductJobEntity.instance(job_dict)
                    self.set_job_by_key(RedisListKeyEnum.product_crawl_job, new_job)
            except CrawlErrorException:
                # crawl failure: increment the consecutive HTTP failure count
                self.set_error_job(job_entity)
            except NotFoundException:
                # page does not exist; skip it
                pass
            common.sleep_random()
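Each snippet on this page throttles its loop with common.sleep_random() from the project's shared common helper module, which is not shown here. A minimal sketch of what such a helper typically looks like; the min_seconds/max_seconds bounds are illustrative assumptions, not the project's actual values:

import random
import time

def sleep_random(min_seconds=1.0, max_seconds=3.0):
    # Pause for a random interval so requests are not issued at a fixed rate.
    # The bounds are placeholders; the real helper may use different values.
    time.sleep(random.uniform(min_seconds, max_seconds))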
Example #2
import json
import pprint

from kafka import KafkaConsumer

def new_consumer(group_id):
    # kafka-python accepts 'earliest'/'latest'; 'earliest' is the
    # equivalent of the legacy 'smallest' setting
    consumer = KafkaConsumer('orders',
                             group_id=group_id,
                             auto_offset_reset='earliest')

    for msg in consumer:
        common.sleep_random()
        json_data = json.loads(msg.value)
        common.prGreen(
            "\nService %s got a message: -- Msg Key: %s, partition: [%s] offset [%s]:"
            % (group_id, msg.key, msg.partition, msg.offset))
        pprint.pprint(json_data, indent=1, width=40)
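The consumer above reads JSON messages from the 'orders' topic. For context, a minimal kafka-python producer that publishes such messages could look like the sketch below; the broker address, key, and payload are assumptions, not taken from the original project:

import json

from kafka import KafkaProducer

producer = KafkaProducer(
    bootstrap_servers='localhost:9092',  # assumed broker address
    key_serializer=lambda k: k.encode('utf-8'),
    value_serializer=lambda v: json.dumps(v).encode('utf-8'))

# Hypothetical order payload; the real schema is defined by the project.
producer.send('orders', key='store', value={'item': 'shirt', 'qty': 1})
producer.flush()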
Example #3
def run_job(self):
    Logger().info('product_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    # anti-scraping on the product pages is strict,
                    # so rotate through random-IP proxies here
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                ProductCrawler(job_entity, self.http)
            except CrawlErrorException:
                # crawl failure: increment the consecutive HTTP failure count
                self.set_error_job(job_entity)
            except NotFoundException:
                # page does not exist; skip it
                pass
            common.sleep_random()
Example #4
def run_job(self):
    Logger().info('product_review_consumer start')
    self.http = Http()
    self.proxy_engine = get_proxy_engine()
    self.http.set_headers(self.headers)
    while True:
        job_dict = self.get_job_obj()
        if job_dict:
            job_entity = ProductReviewJobEntity.instance(job_dict)
            try:
                if self.proxy_engine:
                    self.http.set_proxy(self.proxy_engine.get_proxy())
                crawl = ProductReviewCrawler(job_entity, self.http)
                if crawl.crawl_next_page:
                    # more review pages remain: advance the page and re-queue the job
                    job_entity.page += 1
                    self.set_job(job_entity)
            except CrawlErrorException:
                # crawl failure: increment the consecutive HTTP failure count
                self.set_error_job(job_entity)
            except NotFoundException:
                # page does not exist; skip it
                pass
            common.sleep_random()
Example #5
def callbackstore(ch, method, properties, body):
    common.sleep_random()
    print_method("store", method.routing_key)
    common.print_json(body)
    ch.basic_ack(delivery_tag=method.delivery_tag)
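callbackstore is a RabbitMQ (pika) message handler: it prints the routing key and JSON body, then acknowledges the delivery. A minimal sketch of wiring it to a queue with pika's BlockingConnection; the queue name and connection parameters are assumptions:

import pika

connection = pika.BlockingConnection(pika.ConnectionParameters(host='localhost'))
channel = connection.channel()
channel.queue_declare(queue='store')  # assumed queue name
channel.basic_consume(queue='store', on_message_callback=callbackstore)
channel.start_consuming()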
Example #6
def start_store_consumer():
    common.sleep_random()
    print("Service for \" store \" started")
    common.divide()
    new_consumer("store")
Example #7
def start_clothes_consumer():
    common.sleep_random()
    print("Service for \"clothes\" started")
    common.divide()
    new_consumer("clothes")
Example #8
def start_mail_consumer():
    common.sleep_random()
    print("Service for \"mail\" started")
    common.divide()
    new_consumer("mail")
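Each start_*_consumer function blocks inside new_consumer, so the services are normally launched as separate processes (or separate containers). A minimal sketch, assuming the three functions above are importable from the same module:

from multiprocessing import Process

if __name__ == '__main__':
    targets = (start_store_consumer, start_clothes_consumer, start_mail_consumer)
    workers = [Process(target=t) for t in targets]
    for w in workers:
        w.start()
    for w in workers:
        w.join()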