def zhihu_question_parser(html, url):
    name = txt_wrap_by('<title>', ' - 知乎</title>', html)
    name = unescape(name)
    if '<h3>邀请别人回答问题</h3>' in html:
        answer_count = txt_wrap_by('<span id="xhrw">', ' 个答案</span>', html)
    else:
        answer_count = txt_wrap_by('<h3 style="margin: 0 0 5px;">', ' 个答案</', html)
    tag = map(unescape,
              txt_wrap_by_all('<a class="xjl" href="javascript:;">', '</a>', html))
    #print tag[0]
    answer_count = int(answer_count or 0)
    if answer_count:
        txt = filter(bool, txt_wrap_by_all('<div class="xmrw">', '</div>', html))
        if not txt:
            print url
            print name
            #raw_input()
        else:
            print txt[0]
    else:
        if "个答案" in html and ("0 个答案" not in html) and "还没有答案" not in html:
            print url
            print html
            #raw_input()
        txt = []
    RESULT.append((answer_count, url, name, tag, [htm2txt(i) for i in txt]))
    print how_long.again(), how_long.remain, how_long.done
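# txt_wrap_by() and txt_wrap_by_all() are used by every parser in this file
# but are defined elsewhere.  A minimal sketch of the assumed behaviour
# (substring extraction between two literal markers); the real helpers may
# differ in how they handle a missing marker:
def txt_wrap_by(begin, end, html):
    # return the text between the first `begin` and the following `end`,
    # or '' when either marker is missing
    start = html.find(begin)
    if start < 0:
        return ''
    start += len(begin)
    stop = html.find(end, start)
    if stop < 0:
        return ''
    return html[start:stop]

def txt_wrap_by_all(begin, end, html):
    # return every non-overlapping begin...end match as a list
    result = []
    pos = 0
    while True:
        start = html.find(begin, pos)
        if start < 0:
            break
        start += len(begin)
        stop = html.find(end, start)
        if stop < 0:
            break
        result.append(html[start:stop])
        pos = stop + len(end)
    return result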
def page_parse(htm_file):
    html = open(htm_file).read()
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('xgm" href="javascript:;">', '</', html)
    reply_raw_list = txt_wrap_by_all('<div class="xmo">', 'class="xnq xml xnh">', html)
    replies = [htm2txt(x)[0] for x in reply_raw_list]
    js = '["current_question",' + txt_wrap_by("(['current_question', ", ');', html)
    a = loads(js)
    answer_list = []
    question_info = {}
    question_info['answer'] = answer_list
    question_info['tags'] = [x[0] for x in a[1][3]]
    question_info['title'] = title
    question_info['body'] = htm2txt(
        txt_wrap_by('<div class="xvrw">', '<a href="javascript', html))[0]
    replies_line = zip(a[1][12], replies)
    for x in replies_line:
        try:
            new_ans = {}
            new_ans['name'] = x[0][2][0]
            new_ans['answer'] = x[1]
            new_ans['id'] = x[0][2][1]
            new_ans['signature'] = x[0][3]
            new_ans['votes'] = x[0][4]
            answer_list.append(new_ans)
        except:
            continue
    out_file.write(dumps(question_info) + '\n')
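# page_parse() above depends on a module-level out_file handle and on json's
# loads/dumps; a hedged sketch of how it might be driven over locally saved
# question pages (the directory and output filename here are hypothetical):
from glob import glob
from json import loads, dumps

out_file = open('zhihu_question.json', 'a')
for htm_file in glob('question_page/*.htm'):
    page_parse(htm_file)
out_file.close()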
def main():
    cookies = ((
        '*****@*****.**',
        '_xsrf=7ed86e897bae4b9e8cf3e660efed7baf; q_c0=MTk2OTAzfGdmWDM5Q2pZNVpaUW9UTzA=|1326267926|eedfe70f85add0db0ecda1e73200cac9b085ecc6; __utma=155987696.1247389772.1322703824.1326190947.1326266591.29; __utmb=155987696.34.10.1326266591; __utmc=155987696; __utmz=155987696.1325768571.27.6.utmcsr=zhihu.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utmv=155987696.Logged%20In'
    ), )
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Language': 'en,en-US;q=0.8,zh-CN;q=0.6,zh;q=0.4',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.zhihu.com',
        'Referer': 'http://www.zhihu.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11',
    }
    count = 0
    headers['cookie'] = cookies[0][1]
    explore_page = fetch('http://www.zhihu.com/explore', headers=headers)
    entry_list = txt_wrap_by_all('<div class="xxn">', '</div', explore_page)
    reting_raw = txt_wrap_by("['explore_list',", ');', explore_page)
    data = loads(reting_raw)
    author_list = [
        [i[3][1][0].encode('utf-8'), i[3][2].encode('utf-8')] for i in data
    ]
    rating_list = [i[3][3] for i in data]
    label_list = txt_wrap_by_all('"padding:3px 0 0" class="xm">', '</div',
                                 explore_page)
    result_label = [txt_wrap_by_all('">', '</a', i) for i in label_list]
    url_list = txt_wrap_by_all('<h2', '</h2>', explore_page)
    id_list = [txt_wrap_by('question/', '/answer', i) for i in url_list]
    title_list = [
        txt_wrap_by('">', '<', txt_wrap_by('href="', '/a>', i))
        for i in url_list
    ]
    url_list = ['http://www.zhihu.com/question/%s' % id for id in id_list]
    entry_list = zip(title_list, rating_list, result_label, author_list,
                     url_list, entry_list)
    for entry in entry_list:
        content, pic_list = htm2txt(entry[5])
        Spider.insert(entry[0], entry[2], content, entry[3][0], entry[1],
                      entry[4], [], pic_list)
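# fetch() is not defined in this file; a minimal sketch, assuming it is a thin
# urllib2 wrapper that sends the given headers and returns the response body:
import urllib2

def fetch(url, headers=None, data=None):
    request = urllib2.Request(url, data=data, headers=headers or {})
    return urllib2.urlopen(request, timeout=60).read()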
def read_next(start, offset):
    data = {
        'offset': offset,
        'start': start
    }
    result = []
    data = urlencode(data)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.7) Gecko/20091221 Firefox/3.5.7',
        'Accept': ' text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'gb18030,utf-8;q=0.7,*;q=0.7',
        'Content-type': 'application/x-www-form-urlencoded'
    }
    headers['Cookie'] = """__utma=155987696.1466564421.1323058814.1323081063.1323082137.3; __utmz=155987696.1323082137.3.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic|utmctr=site%3Azhihu.com; __utmv=155987696.Logged%20In; _xsrf=5f0d189d485b43cca16068abe2d981ec; __utmc=155987696; __utmb=155987696.70.10.1323082137; checkcode=d3Nsag==|1323081329|606c2864ea806947dae5b5a8d7ab17c2ad22894e; q_c0=MTY2MzIxfFZHUkQxQ2xweUp6Y1czMDk=|1323083404|aabdf01be80a6e1b1c2f6817b03ef2de8a62eb2f"""
    request = urllib2.Request(
        url='http://www.zhihu.com/log/questions',
        data=data,
        headers=headers
    )
    urlopener = urllib2.build_opener()
    r = urlopener.open(request)
    j = r.read()
    j = loads(j)
    html = j['msg'][1]
    name_list = txt_wrap_by_all('''</h2> <div> <a''', '<', html)
    id_list = txt_wrap_by_all('logitem-', '">', html)
    begin = '<a href="/question/'
    end = '</a'
    for id, name, i in zip(id_list, name_list, txt_wrap_by_all(begin, end, html)):
        i = i.split('">', 1)
        i.append(id)
        name = unescape(name).strip()[14:].split('">', 1)
        if len(name) < 2:
            name = '?', '?'
        i.extend(name)
        result.append(i)
    return 20 + offset, result
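# read_next() returns (next_offset, parsed_rows), so a caller can page through
# http://www.zhihu.com/log/questions roughly like this (a sketch; the initial
# start value and the stop condition are assumptions):
start, offset = '', 0
while True:
    offset, rows = read_next(start, offset)
    if not rows:
        break
    for row in rows:
        print row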
def zhihu_topic_parser(html, url):
    global FETCH_COUNT
    #txt = txt_wrap_by('DZMT.push(["current_topic",', ')', html)
    #print loads(txt)[:2][0][0]
    question_id_list = map(
        int, filter(str.isdigit, txt_wrap_by_all('href="/question/', '">', html)))
    QUESTION_ID_SET.update(question_id_list)
    #QUESTION_ID_SET
    feed_id_list = txt_wrap_by_all('id="feed-', '">', html)
    print feed_id_list
    #for i in feed_id_list:
    #    yield zhihu_question_parser, "http://www.zhihu.com/question/%s" % i
    if len(feed_id_list) >= 20:
        last_one = feed_id_list[-1]
        yield zhihu_topic_feed, {
            'url': url,
            'data': urlencode(dict(start=last_one, offset=20))
        }, 20
def find_next(page_file):
    global question_set_fetched, question_set
    with open(page_file) as page:
        link = set(
            txt_wrap_by_all('<a class="xu" name="rlq" id="rlq-', '"', page.read()))
        question_set |= link
def page_parse(html):
    title = txt_wrap_by('<title>', '- 知乎', html)
    tags = txt_wrap_by_all('data-tip="t$b$', '"', html)
    for i in tags:
        print i,
    print title
    print ''
def __call__(self, html, url):
    html = txt_wrap_by('<ul class="list-m">', '</ul>', html)
    items = txt_wrap_by_all('<li class="item">', '</div>', html)
    if not items:
        items = txt_wrap_by_all('<h3><a', '</h3', html)
    links = []
    for item in items:
        link = txt_wrap_by('href="', '"', item)
        id = txt_wrap_by('http://www.douban.com/event/', '/', link)
        id = int(id)
        event = ImportDoubanEvent.get(id)
        if not event:
            yield self.parse_event_page, link, id
            ImportDoubanEvent(id=id, event_id=0).save()
def user_id_by_txt(htm):
    r = [
        str(uid).rstrip('/') for uid in set(
            txt_wrap_by_all('href="http://www.douban.com/people/', '"', htm))
    ]
    r = [i for i in r if i.isalnum()]
    return r
def func_url(self, title):
    t = [
        i.split('">', 1) for i in txt_wrap_by_all('<a href="', '</a>', title)
    ]
    url, topic_name = t[1]
    return parse_topic_htm, url
def wm_parser(html, url):
    if "&p=" not in url:
        REAL_USER.add(url.rsplit("=", 1)[-1])
        page_id = txt_wrap_by_all(' pageid="', '"', html)
        if page_id:
            page_id = int(page_id[-1])
            for i in xrange(1, page_id + 1):
                yield wm_parser, url + "&p=%s" % i
    for user_name in txt_wrap_by_all(' href="/user/', '"', html):
        if "/" not in user_name:
            if (user_name in EXIST_USER) or (user_name in REAL_USER):
                continue
            EXIST_USER.add(user_name)
            yield wm_parser, "http://www.wumii.com/user/list/followings?u=%s" % user_name
            yield wm_parser, "http://www.wumii.com/user/list/fans?u=%s" % user_name
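# The parsers in this file are generators that yield (next_parser, next_url)
# jobs, some with extra arguments such as an offset.  A hedged sketch of the
# breadth-first driver loop such a spider framework would run; run_spider and
# its queue handling are assumptions, not the original scheduler:
def run_spider(parser, url):
    queue = [(parser, url)]
    while queue:
        job = queue.pop(0)
        parse, target = job[0], job[1]
        html = fetch(target)
        # extra yielded arguments (e.g. the offset used by the zhihu feed
        # parsers) are ignored in this simplified sketch
        for next_job in parse(html, target) or ():
            queue.append(next_job)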
def parse_index(self, page, url):
    link_wrap_list = txt_wrap_by_all('已翻译', '<span', page)
    link_list = []
    for link_wrap in link_wrap_list:
        url = txt_wrap_by('href="', '"', link_wrap)
        if url and not url_is_fetched(url):
            yield self.parse_page, 'http://dongxi.net/%s' % url
def parse_page(self, filepath):
    with open(filepath) as f:
        page = f.read()
    title = txt_wrap_by('<title>译言网 | ', '</ti', page)
    tags_wrapper = txt_wrap_by('wumiiTags = "', '"', page)
    tags = tags_wrapper.split(',')
    author = txt_wrap_by('<h2 id="user_info"', '/a', page)
    author = txt_wrap_by('">', '<', author)
    rating = txt_wrap_by('已有<span class="number">', '</span', page)
    content_wrapper = txt_wrap_by('id="conBox">', '<div class="article_content">', page)
    url = txt_wrap_by('wumiiPermaLink = "', '"', page)
    if content_wrapper:
        content, pic_list = htm2txt(content_wrapper)
    else:
        return
    content = str(content)
    reply_wrapper_list = txt_wrap_by_all('class="comment_content">', '</ul', page)
    reply_list = []
    for reply_wrapper in reply_wrapper_list:
        reply_list.append(txt_wrap_by('<p>', '</p', reply_wrapper))
    Spider.insert(title, tags, content, author, rating, url, reply_list, pic_list)
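# htm2txt() is used throughout but defined elsewhere; judging by the call
# sites it returns a (plain_text, picture_url_list) pair for an HTML fragment.
# A rough sketch of that assumed contract with the stdlib HTMLParser:
from HTMLParser import HTMLParser

class _Htm2Txt(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.buf = []
        self.pics = []

    def handle_starttag(self, tag, attrs):
        # collect image links, drop every other tag
        if tag == 'img':
            src = dict(attrs).get('src')
            if src:
                self.pics.append(src)

    def handle_data(self, data):
        self.buf.append(data)

def htm2txt(html):
    parser = _Htm2Txt()
    parser.feed(html)
    return ''.join(parser.buf).strip(), parser.pics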
def zhihu_topic_feed(html, url, offset):
    o = loads(html)
    #pprint(o)
    id_list = txt_wrap_by_all('id=\\"feed-', '\\"', html)
    question_id_list = txt_wrap_by_all('href=\\"/question/', '\\"', html)
    QUESTION_ID_SET.update(map(int, question_id_list))
    print ">>>", len(QUESTION_ID_SET), 'question', how_long.done, how_long.remain, how_long.estimate()
    #for i in id_list:
    #    yield zhihu_question_parser, "http://www.zhihu.com/question/%s" % i
    #print id_list
    if len(id_list) > 3:
        offset += o['msg'][0]
        yield zhihu_topic_feed, {
            'url': url['url'],
            'data': urlencode(dict(start=id_list[-1], offset=offset))
        }, offset
    else:
        print "done", how_long.again(), how_long.done, how_long.remain
def parse_page(self, page, url):
    print "Dongxi...%s" % url
    title = txt_wrap_by('<div class="content_title clearfix">', '</h1>',
                        page).strip().split('>')[-1].strip()
    author = txt_wrap_by('<a class="link_text_blue" href="', '</a>',
                         page).strip().split('>')[-1].strip()
    tags = map(lambda x: x.split('>')[-1],
               txt_wrap_by_all("<a class='link_text_blue'", '</a>', page))
    rating_num = txt_wrap_by('onclick="favorate(', ')', page)
    content = txt_wrap_by('id="full_text">', '</div', page)
    yield (self.parse_rat,
           'http://dongxi.net/content/widget/page_id/%s' % rating_num,
           title, author, tags, url, content)
def parse_content(txt):
    #id = txt_wrap_by('<a href="/question/', '/log" class="xrv">', txt)
    #t = unescape(txt_wrap_by('<title>', ' - 知乎</title>', txt))
    tlist = txt_wrap_by_all('<div class="xmrw">', '</div>', txt)
    r = [htm2txt(i) for i in tlist if i.strip()]
    #for pos, i in enumerate(r[:3]):
    #    print pos, len(i), i
    #    print "\n"
    return r
def parse_index(self, page, url):
    print "!"
    link_wrapper_list = txt_wrap_by_all('<h5 clas', '</h5', page)
    link_list = []
    for link_wrapper in link_wrapper_list:
        url = txt_wrap_by('href="', '"', link_wrapper)
        filename = self.name_builder(url)
        if not url_is_fetched(url):
            yield self.save_page, url
        else:
            self.parse_page(filename)
def func_url(self, title):
    t = [i.split('">', 1) for i in txt_wrap_by_all('<a href="', '</a>', title)]
    url, note_title = t[1]
    if url.startswith('http://www.douban.com/note/'):
        func = parse_note_people_htm
    elif url.startswith('http://site.douban.com/widget/notes/'):
        func = parse_note_site_htm
    else:
        func = 0
    return func, url
def wm_parser(html, url):
    user = txt_wrap_by('&u=', '&', url)
    #print user
    time = txt_wrap_by('<li id="maxActionTimeInMs" m="', '"', html)
    if time and 'm=' + time not in url and int(time) > 0:
        yield wm_parser, url[:url.rfind('=') + 1] + str(time)
    user_id = wm_user_id(user)
    for i in txt_wrap_by_all(' itemid="', '<p class="operating">', html):
        if 'class="content"' in i:
            id = i[:i.find('"')]
            wm = SpiderWm.get(wmid=id)
            if wm is None:
                yield wm_txt_parser, 'http://www.wumii.com/reader/article?id=%s' % id, user_id
            else:
                wm_fav(user_id, wm.id)
def htm(self, data):
    result = []
    html = txt_wrap_by('<div class="topic-content">', '</div>', data)
    if html:
        result.append(html)
    user_id = self.user_id(data)
    topic_reply = txt_wrap_by('<ul class="topic-reply">', '</ul>', data)
    topic_reply = txt_wrap_by_all(' <div class="reply-doc">',
                                  ' class="lnk-reply">回应</a>', topic_reply)
    for i in topic_reply:
        owner_id = txt_wrap_by('<div class="bg-img-green">', '</h4>', i)
        owner_id = txt_wrap_by('<a href="http://www.douban.com/people/', '/">',
                               owner_id)
        if owner_id != user_id:
            break
        result.append(txt_wrap_by('</div>', '<div class="operation_div"', i))
    return '\n'.join(result)
import datetime
import sys
import traceback
from urllib2 import urlopen

# `buffer` and `passed` are sets of Google+ user ids maintained by the
# surrounding script: `buffer` holds ids still to visit, `passed` the ids
# already crawled.
touch = 'http://42qu.com/google_plus?q='

while buffer:
    uid = buffer.pop()
    passed.add(uid)
    url = 'https://plus.google.com/%s/posts?hl=en' % uid
    print url
    try:
        html = urlopen(url, timeout=60).read()
    except:
        traceback.print_exc()
        continue
    if not has_cn(html):
        continue
    for i in txt_wrap_by_all('href="/', '"', html):
        if i.isdigit():
            i = int(i)
            if i in passed:
                continue
            if i in buffer:
                continue
            buffer.add(i)
            print i, datetime.datetime.now()
            sys.stdout.flush()
            try:
                urlopen(touch + str(i), timeout=30)
            except:
                traceback.print_exc()
                continue
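# has_cn() is not defined here; a minimal sketch, assuming it simply tests
# whether the fetched page contains any CJK characters:
import re

def has_cn(html):
    if isinstance(html, str):
        html = html.decode('utf-8', 'ignore')
    return re.search(u'[\u4e00-\u9fff]', html) is not None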