def extract_and_fill(self, item, data):
    """Decode and parse a raw page body, then copy the parsed fields onto item.

    Args:
        item: Mapping-like object that receives the 'publish_datetime',
            'title' and 'content' entries (mutated in place).
        data: Raw page body handed to decodeHtml.

    Returns:
        None; the result is written into ``item``.
    """
    # Convert the raw body to UTF-8 text, then parse it.
    parsed = parseHtml(decodeHtml(data))

    # (item field, key in the parsed result), assigned in this order.
    field_sources = (
        ('publish_datetime', 'datetime'),
        ('title', 'title'),
        ('content', 'text'),
    )
    for field, source_key in field_sources:
        item[field] = parsed[source_key]
def parse_detail(self, response):
    """Build a CrawlItem from a detail-page response.

    The item records the response URL, this object's ``site_name``, the
    current local time, and the MD5 hex digest of the URL as ``uuid``. The
    response body is then decoded and parsed to fill in the publish
    datetime, title and content.

    Args:
        response: Object exposing ``url`` and ``body`` attributes.

    Returns:
        The populated CrawlItem.
    """
    page_url = response.url
    item = CrawlItem(
        url=page_url,
        site=self.site_name,
        crawl_datetime=datetime.datetime.now(),
        uuid=hashlib.md5(page_url).hexdigest(),
    )

    # Convert the raw body to UTF-8 text, then parse it.
    parsed = parseHtml(decodeHtml(response.body))

    # (item field, key in the parsed result), assigned in this order.
    field_sources = (
        ("publish_datetime", "datetime"),
        ("title", "title"),
        ("content", "text"),
    )
    for field, source_key in field_sources:
        item[field] = parsed[source_key]

    return item
def getHtml(url):
    """Fetch a URL and return its body after passing it through decodeHtml.

    Args:
        url: Address passed to ``urllib2.urlopen``.

    Returns:
        Whatever ``decodeHtml`` returns for the downloaded body.

    Errors from ``urllib2.urlopen``, ``read`` or ``decodeHtml`` are not
    caught here and propagate to the caller.
    """
    response = urllib2.urlopen(url)
    try:
        data = response.read()
    finally:
        # Always release the connection, even if read() fails; the
        # original left the response open.
        response.close()
    return decodeHtml(data)