def crawl(self):
    """Fetch a Weibo article by its cid, parse title/time/content/publisher,
    and export the result as a SearchArticleModel.

    Reads ``self.key`` (the article cid) and ``self.data`` (crawl-task
    metadata: ``key`` and ``source_type``). No return value; the parsed
    record is handed off via ``export``.
    """
    key = self.key
    data = self.data
    # The AJAX endpoint carries the article body as JSON; the /p/ URL is
    # only recorded as the article's canonical location in the output.
    homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
    url = "http://weibo.com/p/" + key
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    html_stream = json_stream['data']['article']
    soup = HandleContent.get_BScontext(html_stream, text=True)
    # NOTE(review): [0] indexing assumes .title/.time/.WBA_content are
    # always present in the article markup — an IndexError here aborts
    # the crawl for this item.
    title = soup.select('.title')[0].text
    pubtime = soup.select('.time')[0].text
    pubtime = HandleContent.strformat(str(pubtime))
    content = soup.select('.WBA_content')[0]
    content = clear_label(list(content))
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    publishers = soup.select('.S_link2')
    # Prefer the second publisher link when present, fall back to the
    # first, and to '' when no .S_link2 element exists at all.
    try:
        publisher = publishers[1].text if len(
            publishers) > 1 else publishers[0].text
    except IndexError:
        # Narrowed from a bare ``except:`` so unrelated errors (network,
        # KeyboardInterrupt, ...) still surface instead of being swallowed.
        publisher = ''
    date = new_time()
    crawl_data = {
        'title': title,
        'pubtime': pubtime,
        'source': 'weibo',
        'publisher': publisher,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'origin_source': u'微博搜索',
        'url': url,
        'key': data.get('key', ''),
        'type': u'元搜索',
        'source_type': data.get('source_type', ''),
        'content': content,
        'comment': comment,
    }
    model = SearchArticleModel(crawl_data)
    export(model)
def crawl(self):
    """Fetch a Weibo article by its cid, parse title/time/content/publisher,
    and export the result as a SearchArticleModel.

    Reads ``self.key`` (the article cid) and ``self.data`` (crawl-task
    metadata: ``key`` and ``source_type``). No return value; the parsed
    record is handed off via ``export``.
    """
    key = self.key
    data = self.data
    # The AJAX endpoint carries the article body as JSON; the /p/ URL is
    # only recorded as the article's canonical location in the output.
    homepage = "http://card.weibo.com/article/aj/articleshow?cid=" + key
    url = "http://weibo.com/p/" + key
    html_stream = _get_url(homepage)
    json_stream = change_to_json(str(html_stream.text))
    html_stream = json_stream['data']['article']
    soup = HandleContent.get_BScontext(html_stream, text=True)
    # NOTE(review): [0] indexing assumes .title/.time/.WBA_content are
    # always present in the article markup — an IndexError here aborts
    # the crawl for this item.
    title = soup.select('.title')[0].text
    pubtime = soup.select('.time')[0].text
    pubtime = HandleContent.strformat(str(pubtime))
    content = soup.select('.WBA_content')[0]
    content = clear_label(list(content))
    comment = {}
    text = HandleContent.get_BScontext(content, text=True).text
    comment['content'] = clear_space(text)
    publishers = soup.select('.S_link2')
    # Prefer the second publisher link when present, fall back to the
    # first, and to '' when no .S_link2 element exists at all.
    try:
        publisher = publishers[1].text if len(
            publishers) > 1 else publishers[0].text
    except IndexError:
        # Narrowed from a bare ``except:`` so unrelated errors (network,
        # KeyboardInterrupt, ...) still surface instead of being swallowed.
        publisher = ''
    date = new_time()
    crawl_data = {
        'title': title,
        'pubtime': pubtime,
        'source': 'weibo',
        'publisher': publisher,
        'crtime_int': date.get('crtime_int'),
        'crtime': date.get('crtime'),
        'origin_source': u'微博搜索',
        'url': url,
        'key': data.get('key', ''),
        'type': u'元搜索',
        'source_type': data.get('source_type', ''),
        'content': content,
        'comment': comment,
    }
    model = SearchArticleModel(crawl_data)
    export(model)