# Shared imports assumed by the extractors below; util, Weibo and the other
# extract_* helpers referenced here are defined elsewhere in this project.
import re
import json
import traceback

from lxml import etree


def extract_user_info(doc):
    try:
        scripts = util.extract_script(doc)
        # The personal-info block's domid suffix varies, so try the known ones in turn.
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_PersonalInfo__63"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__62"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__61"')
        if script is None:
            script = util.select_script(
                scripts, r'"domid":"Pl_Official_PersonalInfo__59"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        lis = html.xpath(r'//ul/li')
        info = []
        for li in lis:
            text = li.xpath("string()")
            info.append(util.clean_text(text))
        level_info = extract_level_info(doc)
        if level_info:
            info.append(level_info)
        return info
    except:
        traceback.print_exc()
        return None
def extract_hot_weibo(doc):
    try:
        scripts = util.extract_script(doc)
        print(len(scripts))
        script = util.select_script(scripts, r'"domid":"Pl_Core_NewMixFeed__3"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                # usercard looks like "id=<uid>&...", keep only the uid part.
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()
        return weibos
    except:
        traceback.print_exc()
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                weibos.append(weibo)
            except:
                traceback.print_exc()
        if page_num:
            # The page total sits in the last pagination item; strip the
            # surrounding characters and parse the number.
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None
def extract_search_result_count(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
    text = script.text.strip()
    text = text.encode("utf-8", "ignore")
    print(text)
    # "找到N条结果" means "found N results"; return N when the pattern matches.
    p = re.compile(r"找到(\d+)条结果")
    m = p.search(text)
    print(m)
    if m:
        return int(m.group(1))
    return None
def extract_content_html(self, html):
    """
    Extracting html code that contains weibo content.
    """
    scripts = util.extract_script(html)
    script = util.select_script(scripts, r'pl.content.homefeed.index')
    text = script.text.strip()
    return util.extract_html_from_script(text)
def extract_content_html(self, html, single=False):
    """
    Extracting html code that contains weibo content.
    """
    scripts = util.extract_script(html)
    if not single:
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_MyProfileFeed')
        if not script:
            script = util.select_script(
                scripts, r'"domid":"v6_pl_content_homefeed"')
    else:
        script = util.select_script(
            scripts, r'pl.content.weiboDetail.index')
    text = script.text.strip()
    return util.extract_html_from_script(text)
def extract_content_html(self, html, single=False, hot=False):
    """
    Extracting html code that contains weibo content.
    """
    scripts = util.extract_script(html)
    if hot:
        script = util.select_script(
            scripts, r'"domid":"Pl_Core_NewMixFeed__3"')
    elif not single:
        script = util.select_script(
            scripts, r'"domid":"Pl_Official_MyProfileFeed')
        if not script:
            script = util.select_script(
                scripts, r'"domid":"v6_pl_content_homefeed"')
    else:
        script = util.select_script(
            scripts, r'pl.content.weiboDetail.index')
    text = script.text.strip()
    return util.extract_html_from_script(text)
def extract_topic(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, '"domid":"v6_pl_rightmod_recominfo"')
    text = script.text.strip()
    doc = util.extract_html_from_script(text)
    html = etree.HTML(doc)
    links = html.xpath('//ul[@class="hot_topic"]/li//a')
    topics = []
    for link in links:
        topics.append((link.attrib["href"], link.text.strip()))
    return topics
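# A minimal usage sketch for extract_topic: `doc` is assumed to be the raw
# page source containing the "v6_pl_rightmod_recominfo" block (fetched by
# whatever HTTP layer the project uses); the helper below is illustrative
# and not part of the original module.
def print_hot_topics(doc):
    for href, title in extract_topic(doc):
        # extract_topic returns (href, title) tuples for each hot-topic link.
        print(href, title)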
def extract_level_info(doc):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"domid":"Pl_Official_RightGrowNew')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        p = html.xpath(r'//p[@class="level_info"]')
        if p:
            text = p[0].xpath("string()")
            info = util.clean_text(text)
            return info
    except:
        traceback.print_exc()
        return None
def extract_relation(doc):
    scripts = util.extract_script(doc)
    script = util.select_script(scripts, r'pl.content.followTab.index')
    html = util.extract_html_from_script(script.text.strip())
    html = etree.HTML(html)
    datas = html.xpath(r'.//ul[@class="follow_list"]/li/@action-data')
    for data in datas:
        try:
            # action-data is a query-string-like blob of key=value pairs;
            # split it into a dict and yield one followee per list item.
            followee = {}
            splits = data.split("&")
            for split in splits:
                _splits = split.split("=")
                followee[_splits[0]] = _splits[1]
            yield followee
        except:
            traceback.print_exc()
            continue
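# A sketch of consuming the extract_relation generator. The "uid" and "fnick"
# keys are assumptions about what the action-data blobs contain; only the
# dict-per-followee shape is guaranteed by the function above.
def collect_followees(doc):
    followees = []
    for followee in extract_relation(doc):
        print(followee.get("uid"), followee.get("fnick"))
        followees.append(followee)
    return followees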
def extract_inbox_comment(data):
    comments = []
    try:
        scripts = util.extract_script(data)
        script = util.select_script(
            scripts, r'"domid":"v6_pl_content_commentlist"')
        text = script.text.strip()
        doc = util.extract_html_from_script(text)
        html = etree.HTML(doc)
        divs = html.xpath('//div[@node-type="feed_commentList_comment"]')
    except:
        return comments
    for div in divs:
        try:
            weibo_url, comment = extract_individual_comment(div)
            comments.append((weibo_url, comment))
        except:
            pass
    return comments
def extract_user(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_user_feedList"')
        # The script body wraps a JSON object in a JS callback; pull out the
        # object literal and parse it.
        json_data = re.findall(r"\(({.*})\)", script.text)[0]
        json_data = json.loads(json_data)
        html = etree.HTML(json_data["html"])
        divs = html.xpath(r'//div[@class="list_person clearfix"]')
        users = []
        for div in divs:
            try:
                user = {}
                detail = div.xpath(r'.//div[@class="person_detail"]')[0]
                _as = detail.xpath(r'.//p[@class="person_name"]/a')
                if len(_as) >= 1:
                    user["uid"] = _as[0].attrib.get("uid")
                    user["nick"] = _as[0].attrib.get("title")
                    user["home_url"] = _as[0].attrib.get("href")
                if len(_as) > 1:
                    if _as[1].attrib.get("alt") is not None:
                        user["verify"] = _as[1].attrib.get("alt")
                users.append(user)
            except:
                traceback.print_exc()
                continue
        if page_num:
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return users, total
            except:
                return users, 1
        else:
            return users
    except:
        if page_num:
            return None, None
        else:
            return None
def extract_searched_weibo(doc, page_num=None):
    try:
        scripts = util.extract_script(doc)
        script = util.select_script(scripts, r'"pid":"pl_weibo_direct"')
        html = util.extract_html_from_script(script.text.strip())
        html = etree.HTML(html)
        divs = html.xpath('//div[@action-type="feed_list_item"]')
        weibos = []
        for div in divs:
            try:
                weibo = Weibo()
                weibo["mid"] = div.attrib.get("mid")
                _div = div.xpath('.//a[@class="W_texta W_fb"]')[0]
                # usercard looks like "id=<uid>&...", keep only the uid part.
                usercard = _div.attrib.get("usercard")
                end = usercard.index("&")
                weibo["uid"] = usercard[len("id="):end]
                link = div.xpath('.//*[@class="feed_from W_textb"]/a')[0]
                weibo["url"] = link.attrib.get("href")
                extract_content(div, weibo)
                extract_date_source(div, weibo)
                weibos.append(weibo)
            except:
                traceback.print_exc()
        if page_num:
            # The page total sits in the last pagination item; strip the
            # surrounding characters and parse the number.
            try:
                lis = html.xpath(r'//span[@class="list"]/div/ul/li')
                text = lis[-1].xpath(".//text()")[0]
                total = int(text[1:-1])
                return weibos, total
            except:
                return weibos, 1
        else:
            return weibos
    except:
        if page_num:
            return None, None
        else:
            return None
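# A pagination sketch around extract_searched_weibo, assuming the caller
# fetches one search-result page per iteration. fetch_search_page is a
# hypothetical callable (keyword, page) -> raw HTML; only the
# (weibos, total) / weibos return contract comes from the function above.
def crawl_search(fetch_search_page, keyword):
    weibos, total = extract_searched_weibo(fetch_search_page(keyword, 1), page_num=1)
    if weibos is None:
        return []
    results = list(weibos)
    for page in range(2, total + 1):
        more = extract_searched_weibo(fetch_search_page(keyword, page))
        if more:
            results.extend(more)
    return results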