Пример #1
0
 def get_content(self, response):
     """Build and yield one ``BiddingItem`` from a detail response with a JSON body.

     The announcement markup is read from ``data -> data -> [0] ->
     bulletincontent`` of the decoded body and passed through
     ``html_to_plain_text``. ``key``, ``page``, ``url``, ``url_time`` and
     ``title`` are taken from ``response.meta``.

     If the body is not valid JSON, or the expected path is missing or empty,
     the failure is logged together with key/page/url and nothing is yielded.

     Yields:
         BiddingItem: carries url, url_time, source, and the encoded content
         and title (elements 0..2 of each ``compress_string_and_base64_encode``
         result).
     """
     meta = response.meta
     key = meta['key']
     page = meta['page']
     url = meta['url']
     url_time = meta['url_time']
     title = meta['title']

     # Announce the extraction before parsing so a failing page still leaves
     # a trace of which key/page/url was being processed.
     self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,开始抽取网页:{} 的详情内容".format(key, page, url))

     try:
         bulletincontent = json.loads(response.text)['data']['data'][0]['bulletincontent']
     except (ValueError, KeyError, IndexError, TypeError) as exc:
         # ValueError covers json.JSONDecodeError; KeyError/IndexError cover a
         # missing field or an empty result list; TypeError covers a null or
         # non-container value somewhere along the path.
         # NOTE(review): assumes tml_logger exposes .error like logging.Logger — confirm.
         self.tml_logger.error("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容解析失败:{!r}".format(key, page, url, exc))
         return

     content = html_to_plain_text(bulletincontent)

     self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容开始填充item".format(key, page, url))
     item = BiddingItem()
     item['url'] = url
     item['url_time'] = url_time
     item['source'] = '江苏招标投标公共服务平台'

     html = compress_string_and_base64_encode(content)
     item['origin_length'] = html[0]
     item['compressed_html'] = html[1]
     item['compressed_length'] = html[2]

     url_title = compress_string_and_base64_encode(title)
     item['title_origin_length'] = url_title[0]
     item['title_compressed_html'] = url_title[1]
     item['title_compressed_length'] = url_title[2]

     # NOTE: recording the url/url_time digest (bidding_digest_db.putSha1Digest)
     # is intentionally not done here; it was disabled in the original code.
     self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容item即将提交".format(key, page, url))
     yield item
Пример #2
0
 def get_content(self, response):
     """Build and yield one ``BiddingItem`` from a detail response.

     ``key``, ``page``, ``url``, ``url_time`` and ``title`` are read from
     ``response.meta``; the response text is converted with
     ``html_to_plain_text``. Elements 0..2 of each
     ``compress_string_and_base64_encode`` result are stored on the item as
     the original length, the encoded payload and the compressed length, once
     for the content and once for the title.

     Yields:
         BiddingItem: the populated item.
     """
     meta = response.meta
     key, page, url, url_time, title = (
         meta[name] for name in ('key', 'page', 'url', 'url_time', 'title')
     )
     plain_text = html_to_plain_text(response.text)

     log_args = (key, page, url)
     self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,开始抽取网页:{} 的详情内容".format(*log_args))
     self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,网页:{} 的详情内容开始填充item".format(*log_args))

     item = BiddingItem()
     item['url'] = url
     item['url_time'] = url_time
     item['source'] = '全国公共资源交易平台'

     # Content: positions 0, 1, 2 of the encoder result map onto these fields.
     encoded_body = compress_string_and_base64_encode(plain_text)
     body_fields = ('origin_length', 'compressed_html', 'compressed_length')
     for index, field in enumerate(body_fields):
         item[field] = encoded_body[index]

     # Title: same layout, stored under the title_* fields.
     encoded_title = compress_string_and_base64_encode(title)
     title_fields = (
         'title_origin_length',
         'title_compressed_html',
         'title_compressed_length',
     )
     for index, field in enumerate(title_fields):
         item[field] = encoded_title[index]

     # NOTE: recording the url/url_time digest (bidding_digest_db.putSha1Digest)
     # is disabled here, as in the original code.
     self.tml_logger.info("【bidding_jszfcg】关键词{}在第{}页,网页:{} 的详情内容item即将提交".format(*log_args))
     yield item