def google_rank_new_by_html(uid, html):
    # profile photo: the 200px avatar, re-assembled without the ?sz=200 suffix
    jpg = txt_wrap_by('height="200" src="//', 'photo.jpg?sz=200', html)
    jpg = '//%sphoto.jpg' % jpg

    # follower count: "圈子中有 ... (1,234)" -> 1234
    follower = txt_wrap_by('(', ')', txt_wrap_by('>圈子中有', '</h4>', html))
    if follower:
        follower = follower.replace(',', '')
        if not follower.isdigit():
            follower = 0
    else:
        follower = 0

    name = txt_wrap_by('<title>', '</title>', html).rsplit(' - ')[0]

    # introduction section, with the wrapping markup stripped out
    txt = txt_wrap_by('介绍</h2><div ', '</div></div><div class="', html)
    if txt:
        txt = txt[txt.find('note">') + 6:].replace('</div>', ' ').replace(
            '<div>', ' ').replace('<span>', '').replace('</span>', '').strip()

    return google_rank_new(uid, follower, jpg, name, txt)

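# --- helper sketch ---------------------------------------------------------
# Every parser in this file leans on txt_wrap_by / txt_wrap_by_all from
# zkit.bot_txt (imported explicitly in _parse_user_response below). The
# library itself is not part of this file; what follows is only a minimal
# sketch of the behavior the callers appear to assume -- an empty result when
# a marker is missing, and end=None meaning "to the end of the text" -- not
# the real implementation.
def txt_wrap_by(begin, end, html):
    # return the text between the first `begin` and the following `end`
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    if end is None:
        return html[start:]
    stop = html.find(end, start)
    if stop < 0:
        return ''
    return html[start:stop]

def txt_wrap_by_all(begin, end, html):
    # return every non-overlapping fragment wrapped by `begin` ... `end`
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            break
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            break
        result.append(html[start:stop])
        pos = stop + len(end)
    return result
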
def page_parse(htm_file):
    html = open(htm_file).read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">', 'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]

    # the question metadata is embedded as a JS call; rebuild it into valid JSON
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');', html)
    a = loads(js)

    answer_list = []
    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]

    replies_line = zip(a[1][12], replies)
    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except Exception:  # skip malformed answer entries
            continue
    out_file.write(dumps(question_info) + '\n')

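# page_parse above (and several parsers below) unpack htm2txt(...) as a
# (plain_text, pic_list) pair. htm2txt is not defined in this file; the
# following is only a rough sketch of that assumed contract -- strip the
# markup, collect <img> sources -- for illustration, not the real helper.
import re

def htm2txt(html):
    # collect image URLs before the tags are dropped
    pic_list = re.findall(r'<img[^>]+src="([^"]+)"', html)
    # unwrap the remaining tags and collapse whitespace
    text = re.sub(r'<[^>]+>', ' ', html)
    text = ' '.join(text.split())
    return text, pic_list
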
def title(self, data):
    title = txt_wrap_by(
        '<tr><td class="tablelc"></td><td class="tablecc"><strong>标题:</strong>',
        '</td>', data)
    if not title:
        title = txt_wrap_by('<title>', '</title>', data)
    return title

def __call__(self, data, url):
    rid = url_last(url)
    cid = self.cid
    title = self.title(data)

    rec_num = txt_wrap_by('<span class="rec-num">', '人</span>', data) or 0
    like_num = txt_wrap_by('<span class="fav-num" data-tid="', '</a>喜欢</span>', data) or 0
    if like_num:
        like_num = txt_wrap_by('<a href="#">', '人', like_num)

    yield parse_like, URL_LIKE % (cid, rid), cid, rid

    _topic = _owner = 0

    owner_id = self.user_id(data)
    if owner_id is None:
        return
    try:
        owner_id = int(owner_id)
    except ValueError:
        # the owner is identified by a personalized url name; resolve it,
        # or stash the raw name for later
        _owner_id = DoubanUser.by_url(owner_id)
        if _owner_id:
            owner_id = _owner_id
        else:
            _owner = owner_id
            owner_id = 0

    topic_id = self.topic_id(data)
    try:
        topic_id = int(topic_id)
    except ValueError:
        _topic = topic_id
        topic_id = 0

    time = self.time(data)
    if time:
        time = int_by_string(time)

    feed_id = douban_feed_new(
        cid, rid, rec_num, like_num, title, self.htm(data), time,
        owner_id, topic_id
    )
    if _owner or _topic:
        DoubanFeedOwner(id=feed_id, topic=_topic, owner=_owner).save()

    #for user_id in user_id_by_txt(data):
    #    yield douban_recommendation_begin_tuple(user_id)

    if url in EXIST_PARSE:
        EXIST_PARSE.remove(url)

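# The parsers in this file hand work back to the crawler by yielding
# (next_parser, url_to_fetch, *extra_args) tuples, and the fetched page plus
# the extra args come back as next_parser(html, url, *extra_args) -- compare
# wm_txt_parser(html, url, user_id) with the tuple wm_parser yields for it.
# The crawler itself is not shown here; this toy scheduler is only a sketch
# of that assumed convention (fetch() stands in for whatever HTTP helper the
# real spider uses).
def run_spider(seed_parser, seed_url, fetch):
    queue = [(seed_parser, seed_url)]
    while queue:
        task = queue.pop(0)
        parser, url, extra = task[0], task[1], task[2:]
        html = fetch(url)
        result = parser(html, url, *extra)
        if result is not None:
            for item in result:
                # each yielded item is (next_parser, next_url, *next_extra)
                queue.append(item)
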
def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)

    # the answer count lives in a different element when the page shows the
    # "invite others to answer" block
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)

    tag = map(unescape, txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]

    answer_count = int(answer_count or 0)
    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>', html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []

    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))
    print how_long.again(), how_long.remain, how_long.done

def _parse_user_response(self, callback, xml):
    if xml:
        from zkit.bot_txt import txt_wrap_by
        soup = txt_wrap_by('<author>', '</author>', xml)
        user = dict(
            uid=txt_wrap_by('<email>', '</email>', soup),
            name=txt_wrap_by('<name>', '</name>', soup)
        )
    else:
        user = None
    callback(user)

def main():
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In',
    ),)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    count = 0
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)

    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)

    # the explore page embeds its item metadata as a JS array; parse it as JSON
    rating_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(rating_raw)

    author_list = [[i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')] for i in data]
    rating_list = [i[3][3] for i in data]

    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div', explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]

    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]

    entry_list = zip(title_list, rating_list, result_label, author_list, url_list, entry_list)
    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)

def parse_page(self, page, url):
    print "Dongxi...%s" % url
    title = txt_wrap_by('<div class="content_title clearfix">', '</h1>',
                        page).strip().split('>')[-1].strip()
    author = txt_wrap_by('<a class="link_text_blue" href="', '</a>',
                         page).strip().split('>')[-1].strip()
    tags = map(lambda x: x.split('>')[-1],
               txt_wrap_by_all("<a class='link_text_blue'", '</a>', page))
    rating_num = txt_wrap_by('onclick="favorate(', ')', page)
    content = txt_wrap_by('id="full_text">', '</div', page)
    yield (self.parse_rat,
           'http://dongxi.net/content/widget/page_id/%s' % rating_num,
           title, author, tags, url, content)

def parse_page(self, filepath):
    with open(filepath) as f:
        page = f.read()
    title = txt_wrap_by('<title>译言网 | ', '</ti', page)
    tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
    tags = tags_wrapper.split(',')
    author = txt_wrap_by('<h2 id="user_info"', '/a', page)
    author = txt_wrap_by('">', '<', author)
    rating = txt_wrap_by('已有<span class="number">', '</span', page)
    content_wrapper = txt_wrap_by('id="conBox">', '<div class="article_content">', page)
    url = txt_wrap_by('wumiiPermaLink = "', '"', page)
    if content_wrapper:
        content, pic_list = htm2txt(content_wrapper)
    else:
        return
    content = str(content)

    reply_wrapper_list = txt_wrap_by_all('class="comment_content">', '</ul', page)
    reply_list = []
    for reply_wrapper in reply_wrapper_list:
        reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))

    Spider.insert(title, tags, content, author, rating, url, reply_list, pic_list)

def parse_index(self, page, url):
    link_wrap_list = txt_wrap_by_all('已翻译', '<span', page)
    link_list = []
    for link_wrap in link_wrap_list:
        url = txt_wrap_by('href="', '"', link_wrap)
        if url and not url_is_fetched(url):
            yield self.parse_page, 'http://dongxi.net/%s' % url

def __call__(self, html, url):
    html = txt_wrap_by('<ul class="list-m">', '</ul>', html)
    items = txt_wrap_by_all('<li class="item">', '</div>', html)
    if not items:
        items = txt_wrap_by_all('<h3><a', '</h3', html)
    links = []
    for item in items:
        link = txt_wrap_by('href="', '"', item)
        id = txt_wrap_by('http://www.douban.com/event/', '/', link)
        id = int(id)
        event = ImportDoubanEvent.get(id)
        if not event:
            # only follow events that have not been imported yet
            yield self.parse_event_page, link, id
            ImportDoubanEvent(id=id, event_id=0).save()

def page_parse(html):
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''

def wm_parser(html, url):
    user = txt_wrap_by('&u=', '&', url)
    #print user

    # follow pagination: re-fetch the list with the newest action timestamp
    time = txt_wrap_by('<li id="maxActionTimeInMs" m="', '"', html)
    if time and 'm=' + time not in url and int(time) > 0:
        yield wm_parser, url[:url.rfind('=') + 1] + str(time)

    user_id = wm_user_id(user)
    for i in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' in i:
            id = i[:i.find('"')]
            wm = SpiderWm.get(wmid=id)
            if wm is None:
                yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s' % id, user_id
            else:
                wm_fav(user_id, wm.id)

def htm(self, data):
    result = []
    html = txt_wrap_by('<div class="topic-content">', '</div>', data)
    if html:
        result.append(html)

    user_id = self.user_id(data)
    topic_reply = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
    topic_reply = txt_wrap_by_all(' <div class="reply-doc">',
                                  ' class="lnk-reply">回应</a>', topic_reply)
    # keep only the poster's own follow-up replies; stop at the first reply
    # from anyone else
    for i in topic_reply:
        owner_id = txt_wrap_by('<div class="bg-img-green">', '</h4>', i)
        owner_id = txt_wrap_by('<a href="http://www.douban.com/people/', '/">', owner_id)
        if owner_id != user_id:
            break
        result.append(txt_wrap_by('</div>', '<div class="operation_div"', i))
    return '\n'.join(result)

def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    print name
    print how_long.again(), how_long.remain, how_long.done

def parse_index(self, page, url):
    print "!"
    link_wrapper_list = txt_wrap_by_all('<h5 clas', '</h5', page)
    link_list = []
    for link_wrapper in link_wrapper_list:
        url = txt_wrap_by('href="', '"', link_wrapper)
        filename = self.name_builder(url)
        if not url_is_fetched(url):
            yield self.save_page, url
        else:
            self.parse_page(filename)

def wm_txt_parser(html, url, user_id):
    id = url.rsplit('=')[-1]
    name = txt_wrap_by('target="_blank">', '</a></p>', html)
    author = txt_wrap_by('">来自:', '<', html)
    link = txt_wrap_by('href="', '"', txt_wrap_by('<p class="info', '</p>', html))
    like = txt_wrap_by('class="num-likeIt">', '人喜欢</a>', html)
    txt = txt_wrap_by('<div class="content">', ' <p class="operating">', html)
    time = txt_wrap_by('<span class="time">', '</span>', html)
    wm = wm_save(id, like, name, author, link, time, txt)
    wm_fav(user_id, wm.id)

def parse_page(self, filepath):
    with open(filepath) as f:
        page = f.read()
    title = txt_wrap_by('<title>', '- UCD大社区', page)
    author = txt_wrap_by('style=" float:left; color:#999;">', '</span', page)
    author = txt_wrap_by('作者:', '|', author)
    content_wrapper = txt_wrap_by('<div id="pageContentWrap" style="font-size:13px; ">', '</div', page)
    url = txt_wrap_by('阅读和发布评论:<a href="', '"', page)
    blog_url = txt_wrap_by('>推荐您进入文章源地址阅读和发布评论:<a href="', '"', page)
    if content_wrapper:
        content, pic_list = htm2txt(content_wrapper.decode('utf-8', 'ignore'))
    else:
        return
    content = str(content)
    tags = TAGGER.get_tag(content + title)
    #out = dumps([title, url, tags])
    #print out
    out = dumps([title, content, author, tags])
    #out = dumps([title, content, author, blog_url])
    print out

def topic_id(self, data):
    line = txt_wrap_by('http://site.douban.com/widget/notes/', '/', data)
    return line

def time(self, data):
    line = txt_wrap_by('<span class="datetime">', '</span>', data)
    return line

def title(self, data):
    title = txt_wrap_by('<title>', '</title>', data)
    return title

def parse_event_page(self, page, url, douban_event_id):
    title = txt_wrap_by('h1>', '</h1>', page)
    pic_url = txt_wrap_by('href="', '"', txt_wrap_by('class="album_photo"', '>', page))
    begin_time = txt_wrap_by('ail">', '<', txt_wrap_by('开始时间', '/div', page))
    end_time = txt_wrap_by('ail">', '<', txt_wrap_by('结束时间', '/div', page))
    address = unicode(
        txt_wrap_by('span>', '<',
                    txt_wrap_by('地点', 'br/>',
                                txt_wrap_by('class="obmo">', '</div', page)))
    ).split(' ')
    typ = txt_wrap_by('类型: </span>', '<br/', page)
    typ = txt_wrap_by('">', '/', typ)
    intro = txt_wrap_by('play:none">', '<a href="javasc', page)
    phone = txt_wrap_by('电话', '<br/', intro)
    if not intro:
        intro = txt_wrap_by('<div class="wr">', '</div>', page)
    if phone:
        # strip both the half-width and the full-width colon
        phone = phone.replace(':', '').replace('：', '')
    event = save_event(self, phone, address, begin_time, end_time, title,
                       intro, douban_event_id, typ)
    if event:
        yield save_pic, pic_url, event

def time(self, data):
    line = txt_wrap_by('<div class="note-header">', '</span>', data)
    line = txt_wrap_by('<span class="pl">', None, line)
    return line

def zhihu_topic_parser(html, url):
    txt = txt_wrap_by('DZMT.push(["current_topic",', ')', html)
    global FETCH_COUNT
    print how_long.again(), how_long.done, how_long.remain
    print loads(txt)[:2][0][0]

def group_id(self, data):
    tmp = txt_wrap_by('<form action="/group/topic_search', '</form', data)
    return txt_wrap_by('<input type="hidden" value="', '" name="group', tmp)

def time(self, data):
    line = txt_wrap_by('<div class="topic-doc">', '</span>', data)
    line = txt_wrap_by('<span class="color-green">', None, line)
    return line

def topic_id(self, data):
    line = txt_wrap_by('<div class="aside">', '">回', data)
    line = txt_wrap_by('"http://www.douban.com/group/', '/', line)
    return line

def htm(self, data):
    return txt_wrap_by('<pre class="note">', '</pre>', data)

def user_id(self, data):
    line = txt_wrap_by('<div class="pic">', '">', data)
    line = txt_wrap_by('"http://www.douban.com/people/', '/', line)
    return line

def htm(self, data):
    return txt_wrap_by(' class="note-content"><pre>', '</pre>', data)

def leader_id(self, data):
    # narrow the search to the 组长 (group leader) fragment before pulling
    # out the people id
    t = txt_wrap_by('组长:', '</a>', data)
    return txt_wrap_by('www.douban.com/people/', '/">', t) or 0

def intro(self, data):
    t = txt_wrap_by('class="infobox">', '<div class="rec-sec',
                    data.replace('\r', ' ').replace('\n', ' '))
    return txt_wrap_by('</p>', ' <div', t)

def group_short_url(self, data):
    return txt_wrap_by('http://www.douban.com/feed/group/', '/discussion', data)

def name(self, data):
    # titles look like "xxx小组"; drop the trailing 6-byte UTF-8 suffix "小组"
    return unescape(str(txt_wrap_by('<title>', '</title>', data).strip())[:-6])

def member_num(self, data):
    # member link text looks like "女巫店小组 浏览所有店里的小孩们 (43025" -> 43025
    line = txt_wrap_by('/members">', ')</a>', data)
    if not line:
        return 0
    return int(txt_wrap_by(' (', None, line))
