def get_content(self, response):
    """Build a ``BiddingItem`` from a JSON detail response and yield it.

    The response body is parsed as JSON and the announcement HTML is read
    from ``data.data[0].bulletincontent``. That HTML is passed through
    ``html_to_plain_text``; the result and the title are each passed through
    ``compress_string_and_base64_encode`` and the first three elements of
    every result are stored on the item.

    Args:
        response: Response whose ``meta`` carries ``key``, ``page``, ``url``,
            ``url_time`` and ``title`` and whose ``text`` is the JSON payload.

    Yields:
        A single populated ``BiddingItem``. Nothing is yielded when the
        payload is not valid JSON or does not have the expected structure;
        that case is logged and the page is skipped.

    Raises:
        KeyError: If ``response.meta`` lacks one of the required keys.
    """
    key = response.meta['key']
    page = response.meta['page']
    url = response.meta['url']
    url_time = response.meta['url_time']
    title = response.meta['title']

    # Announce the extraction before it starts so the log order matches
    # what actually happens.
    self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,开始抽取网页:{} 的详情内容".format(key, page, url))

    # The endpoint may answer with a non-JSON error page, a null ``data``
    # member or an empty result list; none of these should abort the
    # callback with a context-free traceback.
    try:
        json_result = json.loads(response.text)
        bulletincontent = json_result['data']['data'][0]['bulletincontent']
    except (ValueError, KeyError, IndexError, TypeError) as exc:
        # NOTE(review): assumes tml_logger offers the standard
        # ``logging.Logger.error`` method - confirm.
        self.tml_logger.error("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容解析失败,已跳过:{!r}".format(key, page, url, exc))
        return

    content = html_to_plain_text(bulletincontent)

    # NOTE: a ``time.sleep(self.wait_time)`` throttle used to sit here and is
    # currently disabled.
    self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容开始填充item".format(key, page, url))
    item = BiddingItem()
    item['url'] = url
    item['url_time'] = url_time
    item['source'] = '江苏招标投标公共服务平台'

    # Indexes 0/1/2 of the helper's result map to original length,
    # compressed payload and compressed length respectively.
    html = compress_string_and_base64_encode(content)
    item['origin_length'] = html[0]
    item['compressed_html'] = html[1]
    item['compressed_length'] = html[2]

    url_title = compress_string_and_base64_encode(title)
    item['title_origin_length'] = url_title[0]
    item['title_compressed_html'] = url_title[1]
    item['title_compressed_length'] = url_title[2]

    # NOTE: recording the URL digest through ``bidding_digest_db`` is
    # currently disabled.
    self.tml_logger.info("【bidding_jiangsuzb】关键词{}在第{}页,网页:{} 的详情内容item即将提交".format(key, page, url))
    yield item
def get_content(self, response):
    """Turn a fetched detail page into a ``BiddingItem`` and yield it.

    ``response.text`` is passed through ``html_to_plain_text``; the result
    and the title are each passed through
    ``compress_string_and_base64_encode``, and elements 0, 1 and 2 of every
    result are copied onto the item.

    Args:
        response: Response whose ``meta`` carries ``key``, ``page``, ``url``,
            ``url_time`` and ``title``.

    Yields:
        A single populated ``BiddingItem``.
    """
    meta = response.meta
    key = meta['key']
    page = meta['page']
    url = meta['url']
    url_time = meta['url_time']
    title = meta['title']

    plain_text = html_to_plain_text(response.text)

    log_info = self.tml_logger.info
    log_info("【bidding_jszfcg】关键词{}在第{}页,开始抽取网页:{} 的详情内容".format(key, page, url))
    log_info("【bidding_jszfcg】关键词{}在第{}页,网页:{} 的详情内容开始填充item".format(key, page, url))

    item = BiddingItem()
    item['url'] = url
    item['url_time'] = url_time
    # NOTE(review): the label names the national platform while the log tag
    # is ``bidding_jszfcg`` - confirm this is the intended source value.
    item['source'] = '全国公共资源交易平台'

    # Positions 0/1/2 of the helper's result feed the three body fields.
    packed_body = compress_string_and_base64_encode(plain_text)
    body_fields = ('origin_length', 'compressed_html', 'compressed_length')
    for position, field in enumerate(body_fields):
        item[field] = packed_body[position]

    # The title goes through the same helper and fills the title_* fields.
    packed_title = compress_string_and_base64_encode(title)
    title_fields = (
        'title_origin_length',
        'title_compressed_html',
        'title_compressed_length',
    )
    for position, field in enumerate(title_fields):
        item[field] = packed_title[position]

    # NOTE: recording the URL digest through ``bidding_digest_db`` is
    # currently disabled.
    log_info("【bidding_jszfcg】关键词{}在第{}页,网页:{} 的详情内容item即将提交".format(key, page, url))
    yield item