def parse_wenda_page(self, response):
    """Parse a single Q&A detail page into a WendaAskItem.

    The page carries one question (div.dtl-top / div.atcle-ms) and exactly
    one reply (div.dtl-reply), which is appended to ``askList``.
    """
    item = WendaAskItem()
    item["tagName"] = self.keyword
    item["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    # Some pages capitalize the meta name; fall back to 'Description'.
    if response.xpath("//meta[@name='description']"):
        item["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
    else:
        item["description"] = response.xpath(
            "//meta[@name='Description']/@content").extract()[0]
    item["title"] = response.css("div.dtl-top h1::text").get()
    item["images"] = []
    # Join the question paragraphs with <br> separators.
    item["content"] = "<br>".join(
        para.xpath("string()").extract()[0].strip()
        for para in response.xpath("//div[@class='atcle-ms']/p"))
    item["addtime"] = response.css(
        "div.dtl-info span:nth-child(1)::text").get()
    item["source"] = response.request.url
    item["username"] = ""
    item["headPortrait"] = ""
    item["askList"] = []
    item["topicUrl"] = ""

    # This layout exposes a single answer block; build it the same way.
    answer = WendaReplayItem()
    answer["title"] = item["title"]
    answer["username"] = response.css(
        "dl.dtl-ys dd b a:nth-child(1)::text").get()
    answer["images"] = []
    answer["content"] = "<br>".join(
        para.xpath("string()").extract()[0].strip()
        for para in response.xpath("//div[@class='dtl-reply']/p"))
    answer["addtime"] = response.css(
        "div.dtl-list div.dtl-time span::text").get()
    item["askList"].append(answer)
    yield item
def parse_wenda_page(self, response):
    """Parse a Q&A detail page (div.ask_cont layout) into a WendaAskItem.

    Each reply lives in div.selected div.sele_all and is appended to
    ``askList``.
    """
    item = WendaAskItem()
    item["tagName"] = self.keyword
    # NOTE: this site uses a capitalized 'Keywords' meta name.
    item["keyword"] = response.xpath(
        "//meta[@name='Keywords']/@content").extract()[0]
    item["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    item["title"] = response.css(
        "div.ask_cont p.ask_tit::text").get().strip()
    item["images"] = []
    item["content"] = response.css(
        "div.ask_cont div.ask_hid p.txt_ms::text").get().strip()
    # span 1 = author name, span 2 = post time in the name/time row.
    item["addtime"] = response.css(
        "div.ask_cont p.txt_nametime span:nth-child(2)::text").get().strip()
    item["source"] = response.request.url
    item["username"] = response.css(
        "div.ask_cont p.txt_nametime span:nth-child(1)::text").get().strip()
    item["headPortrait"] = ""
    item["askList"] = []
    item["topicUrl"] = ""

    replys = response.css("div.selected div.sele_all")
    for reply in replys:
        replyItem = WendaReplayItem()
        replyItem["title"] = item["title"]
        replyItem["username"] = reply.css(
            "div.doc_txt p.doc_xinx span:nth-child(1)::text").get().strip()
        replyItem["likes"] = 0
        replyItem["headPortrait"] = reply.css(
            "div.doc_img a img::attr(src)").get()
        replyItem["images"] = []
        replyItem["content"] = reply.css("p.sele_txt::text").get().strip()
        replyItem["addtime"] = reply.css(
            "div.doc_t_strip div.zwAll p::text").get()
        item["askList"].append(replyItem)
    # Fix: use the spider logger instead of a leftover debug print().
    self.logger.debug("wenda: %s", item)
    yield item
def parse_next_ask(self, response):
    """Parse a follow-up answers page and accumulate replies.

    The partially-built WendaAskItem travels in ``response.meta``; the item
    is only yielded once the current page equals the total page count.
    """
    ask_item = response.meta["wendaAskItem"]
    answer_list = response.xpath("//ul[@class='qa-answer-list']")
    if answer_list:
        for entry in answer_list.xpath("./li[@class='answer-item']"):
            answer = WendaReplayItem()
            answer["title"] = ask_item["title"]
            answer["username"] = entry.xpath(
                ".//ul[@class='qa-meta']/li[@class='username']//span/text()"
            ).extract()[0]
            answer["images"] = []
            answer["content"] = entry.xpath(
                ".//div[@class='answer-text']/text()").extract()[0].strip()
            answer["addtime"] = entry.xpath(
                ".//ul[@class='qa-meta']/li[@class='timestamp']/abbr/@title"
            ).extract()[0]
            answer["source"] = ask_item["source"]
            answer["headPortrait"] = entry.xpath(
                "./ul[@class='qa-meta']/li[@class='useravatar']/a/img/@src"
            ).extract()[0]
            answer["likes"] = entry.xpath(
                "./a[@class='qa-answer-list-vote']/span[@class='n']/em/text()"
            ).extract()[0]
            ask_item["askList"].append(answer)

    page_now = int(response.xpath(
        "//div[@class='pagejump']/span[@class='current']/text()"
    ).extract()[0])
    page_total_text = response.xpath(
        "//div[@class='pagejump']/span[@class='page-number']/text()"
    ).extract()[0]
    # The total is embedded in free text, e.g. "共 12 页" — pull the number.
    page_total = int(re.search(r"\d+\.?\d*", page_total_text).group())
    if page_now == page_total:
        yield ask_item
def parse_wenda(self, response):
    """Parse a forum-style Q&A thread page into a WendaAskItem, then emit a
    KeywordItem built from the page's tag box.

    Yields:
        WendaAskItem with the thread's replies in ``askList``, followed by
        a KeywordItem listing the page tags.
    """
    item = WendaAskItem()
    item["tagName"] = self.keyword
    item["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    item["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    item["title"] = response.xpath(
        "//h1[@class='ts-title']/text()").extract()[0]
    item["images"] = []
    content_table = response.xpath(
        "//div[@class='theme_box post_list']//div[@class='topic_main list_box']//div[@class='fsz_main']")
    if content_table:
        item["content"] = response.xpath(
            "//div[@class='theme_box post_list']//div[@class='topic_main list_box']//div[@class='fsz_main']//table//td/text()").extract()[0].strip()
    else:
        item["content"] = ""
    # Prefer the full timestamp in the span's @title; otherwise strip the
    # "发表于 " prefix from the raw <em> text.
    addtime_span_tag = response.xpath(
        "//div[@class='theme_box post_list']//div[@class='auth_info_main']/em/span")
    if addtime_span_tag:
        item["addtime"] = addtime_span_tag.xpath("./@title").extract()[0]
    else:
        item["addtime"] = response.xpath(
            "//div[@class='auth_info_main']/em").extract()[0].strip().split("发表于 ")[1]
    item["source"] = response.meta["origin_url"]
    item["username"] = response.xpath(
        "//div[@class='theme_box post_list']//div[@class='auth_info_bar']//a/text()").extract()[0]
    # Fix: the original condition was inverted — when the avatar <img>
    # existed it stored "", and when it was absent it called extract()[0]
    # on the same (empty) path, raising IndexError.
    avatar_img = response.xpath(
        "//div[@class='theme_box post_list']/div[@class='auth_info']//div[@class='avatar']/img")
    if avatar_img:
        item["headPortrait"] = avatar_img.xpath("./@src").extract()[0]
    else:
        item["headPortrait"] = ""
    item["askList"] = []
    item["topicUrl"] = ""

    replys = response.xpath(
        "//div[@class='list_box theme_reply']//div[contains(@class, 'post_list')]")
    for reply in replys:
        replyItem = WendaReplayItem()
        replyItem["title"] = item["title"]
        username_tag = reply.xpath(".//div[@class='user_name']/a")
        if username_tag:
            replyItem["username"] = username_tag.xpath(
                "./text()").extract()[0]
        else:
            # Skip replies without an identifiable author.
            continue
        replyItem["images"] = []
        content_table = reply.xpath(".//div[@class='fsz_main']")
        if content_table:
            replyItem["content"] = reply.xpath(
                ".//div[@class='fsz_main']/table//td/text()").extract()[0].strip()
        else:
            replyItem["content"] = ""
        replyItem['addtime'] = reply.xpath(
            ".//div[@class='auth_info_main']/em/text()").extract()[0].split("发表于 ")[1]
        replyItem["source"] = item["source"]
        avatar_tag = reply.xpath(".//div[@class='avatar']/img")
        if avatar_tag:
            replyItem["headPortrait"] = avatar_tag.xpath(
                "./@src").extract()[0]
        else:
            replyItem["headPortrait"] = ""
        replyItem["likes"] = 0
        item["askList"].append(replyItem)
    yield item

    # Harvest the page's tag cloud into a KeywordItem.
    tag_box = response.xpath(
        "//div[@class='mod_s tag_box']/div[@class='mod_con']/a/text()").extract()
    keywordItem = KeywordItem()
    keywordItem["keywordList"] = []
    for keyword in tag_box:
        keyword = keyword.strip()
        # NOTE(review): ``title`` is overwritten each iteration and ends up
        # as the last tag — confirm this is the intended behavior.
        keywordItem["title"] = keyword
        keywordItem["keywordList"].append(keyword)
    keywordItem["source"] = f'https://www.icheruby.com/tags/'
    yield keywordItem
def parse_ask(self, response):
    """Parse a Q&A question page, collect the best answer and first-page
    answers, then either follow pagination (handing the partial item to
    ``parse_next_ask`` via meta) or yield the finished item directly.
    """
    ask_item = WendaAskItem()
    ask_item["tagName"] = self.keyword
    ask_item["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    ask_item["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    ask_item["title"] = response.xpath(
        "//div[@class='qa-title']/h1/text()").extract()[0]
    ask_item["images"] = []
    ask_item["content"] = response.xpath(
        "string(//blockquote[@class='qa-text'])").extract()[0].strip()
    ask_item["addtime"] = response.xpath(
        "//div[@class='qa-related']//span[@class='source']/abbr/text()"
    ).extract()[0]
    ask_item["source"] = response.meta["origin_url"]
    # Registered askers render as <a><span>; anonymous ones as bare <span>.
    has_author_link = response.xpath(
        "//div[@class='qa-related']/div[@class='qa-contributor']/ul/li[1]/a")
    if has_author_link:
        ask_item["username"] = response.xpath(
            "//div[@class='qa-related']/div[@class='qa-contributor']/ul/li[1]/a/span/text()"
        ).extract()[0]
    else:
        ask_item["username"] = response.xpath(
            "//div[@class='qa-related']/div[@class='qa-contributor']/ul/li[1]/span/text()"
        ).extract()[0]
    ask_item["headPortrait"] = ""
    ask_item["askList"] = []
    ask_item["topicUrl"] = ""

    # Accepted ("best") answer, if the page has one.
    best_content = response.xpath("//div[@class='best-content']")
    if best_content:
        best = WendaReplayItem()
        best["title"] = ask_item["title"]
        best["username"] = best_content.xpath(
            ".//div[@class='qa-contributor']//span[@itemprop='accountName']/text()"
        ).extract()[0].strip()
        best["images"] = []
        best["content"] = best_content.xpath(
            "string(.//div[@class='answer-text'])").extract()[0].strip()
        best["addtime"] = best_content.xpath(
            ".//li[@class='timestamp']/abbr/@title").extract()[0]
        best["source"] = ask_item["source"]
        best["headPortrait"] = best_content.xpath(
            ".//p[@class='user-avatar']/a/img/@src").extract()[0]
        best["likes"] = best_content.xpath(
            ".//div[@class='qa-vote']/a/em/text()").extract()[0]
        ask_item["askList"].append(best)

    # Regular answers on this page.
    answer_list = response.xpath("//ul[@class='qa-answer-list']")
    if answer_list:
        for entry in answer_list.xpath("./li[@class='answer-item']"):
            answer = WendaReplayItem()
            answer["title"] = ask_item["title"]
            answer["username"] = entry.xpath(
                ".//ul[@class='qa-meta']/li[@class='username']//span/text()"
            ).extract()[0]
            answer["images"] = []
            answer["content"] = entry.xpath(
                ".//div[@class='answer-text']/text()").extract()[0].strip()
            answer["addtime"] = entry.xpath(
                ".//ul[@class='qa-meta']/li[@class='timestamp']/abbr/@title"
            ).extract()[0]
            answer["source"] = ask_item["source"]
            answer["headPortrait"] = entry.xpath(
                "./ul[@class='qa-meta']/li[@class='useravatar']/a/img/@src"
            ).extract()[0]
            answer["likes"] = entry.xpath(
                "./a[@class='qa-answer-list-vote']/span[@class='n']/em/text()"
            ).extract()[0]
            ask_item["askList"].append(answer)

    pagejump = response.xpath("//div[@class='pagejump']")
    if len(pagejump) > 0:
        # Paginated: follow the "next page" link; parse_next_ask yields the
        # item once the last page has been consumed.
        for page in response.xpath("//div[@class='pagejump']/a"):
            if page.xpath("./text()").extract()[0] == "下一页":
                next_page_url = page.xpath("./@href").extract()[0]
                yield SplashRequest(
                    response.urljoin(next_page_url),
                    self.parse_next_ask,
                    args={'wait': 1},
                    meta={
                        'origin_url': response.urljoin(next_page_url),
                        'wendaAskItem': ask_item
                    })
    else:
        yield ask_item
def parse_zhidao(self, response):
    """Parse a Baidu-Zhidao-style question page into a WendaAskItem.

    The question body/time/author are not scraped on this layout (left
    empty); each answer block becomes a WendaReplayItem in ``askList``.
    """
    ask = WendaAskItem()
    ask["tagName"] = self.keyword
    if response.xpath("//meta[@name='keywords']"):
        ask["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
    else:
        ask["keyword"] = ""
    if response.xpath("//meta[@name='description']"):
        ask["description"] = response.xpath(
            "//meta[@name='description']/@content").extract()[0]
    else:
        ask["description"] = ""
    ask["title"] = response.xpath(
        "//article[@id='qb-content']//span[@class='ask-title']/text()"
    ).extract()[0]
    ask["images"] = []
    image_wrap = response.xpath(
        "//article[@id='qb-content']//div[@class='q-img-wp']")
    if len(image_wrap) > 0:
        # Fix: "//img/@src" is an absolute XPath that matched every image in
        # the document; ".//img/@src" restricts it to the question wrapper.
        ask["images"] = image_wrap.xpath(".//img/@src").extract()
    ask["content"] = ""
    ask["addtime"] = ""
    ask["source"] = response.meta["origin_url"]
    ask["username"] = ""
    ask["headPortrait"] = ""
    ask["askList"] = []
    ask["topicUrl"] = ""

    answers = response.xpath(
        "//div[@class='bd-wrap']/div[contains(@class, 'answer')]")
    for answer in answers:
        reply = WendaReplayItem()
        reply["title"] = ask["title"]
        username = answer.xpath(
            ".//div[@class='wgt-replyer-all']//a[@class='reply-user-tohometip'][2]/span"
        )
        if len(username) > 0:
            reply["username"] = username.xpath("./text()").extract()[0]
        else:
            # Fix: always populate the field so downstream consumers see a
            # consistent schema (other parsers always set username).
            reply["username"] = ""
        reply["images"] = []
        best_text = answer.xpath(
            ".//div[@class='line content']/div[contains(@class, 'best-text')]"
        )
        if len(best_text) > 0:
            # Accepted answer: join its paragraphs with <br>.
            reply["content"] = "<br>".join(
                p.xpath("string()").extract()[0].strip()
                for p in best_text.xpath(".//p"))
        else:
            reply["content"] = answer.xpath(
                "string(.//div[@class='line content']/div[contains(@class, 'answer-text')])"
            ).extract()[0].strip()
        reply["addtime"] = answer.xpath(
            ".//span[@class='wgt-replyer-all-time']/text()").extract()[0]
        reply["source"] = ask["source"]
        # The avatar URL is embedded in an inline style attribute.
        avatar_url = answer.xpath(
            ".//div[@class='wgt-replyer-all-avatar']/@style").extract()[0]
        result = re.search(
            r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]",
            avatar_url)
        # Fix: guard against no match instead of calling .group() on None.
        reply["headPortrait"] = result.group() if result else ""
        good = answer.xpath(
            ".//div[@class='wgt-eva']/span[contains(@class, 'evaluate-good-3')]"
        )
        if good:
            reply["likes"] = good.xpath("./@data-evaluate").extract()[0]
        else:
            reply["likes"] = 0
        ask["askList"].append(reply)
    yield ask
def parse_question_page(self, response):
    """Parse a Zhihu-style question page, attach the built WendaAskItem and
    the view count to the huatiContent item from meta, and yield it.
    """
    huatiContent = response.meta["huatiContent"]
    wenda = WendaAskItem()
    wenda["tagName"] = self.keyword
    wenda["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    wenda["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    wenda["title"] = response.xpath(
        "//div[@class='QuestionHeader']//h1[@class='QuestionHeader-title']/text()"
    ).extract()[0]
    if response.xpath(
            "//div[@class='QuestionHeader']//div[contains(@class, 'QuestionRichText')]//p"
    ):
        wenda["content"] = response.xpath(
            "string(//div[@class='QuestionHeader']//div[contains(@class, 'QuestionRichText')]//p)"
        ).extract()[0]
    else:
        wenda["content"] = ""
        wenda["description"] = ""
    wenda["images"] = []
    wenda["source"] = response.meta["origin_url"]
    wenda["username"] = ""
    wenda["headPortrait"] = ""
    wenda["askList"] = []
    wenda["addtime"] = ""
    wenda["topicUrl"] = response.meta["topic_url"]
    visits = response.xpath(
        "//div[@class='QuestionFollowStatus']//strong[@class='NumberBoard-itemValue']//@title"
    ).extract()[0]

    replys = response.xpath("//div[@class='List-item']")
    for reply in replys:
        replyItem = WendaReplayItem()
        replyItem["title"] = wenda["title"]
        # Fix: the original "else" re-tested the exact same UserLink-link
        # XPath that had just evaluated falsy (a dead branch) and then fell
        # through to a misspelled class name ('AuthorInfo-contet'), which
        # matched nothing and crashed with IndexError. Both live paths
        # reduce to the span text under AuthorInfo-content.
        replyItem["username"] = reply.xpath(
            ".//div[@class='AuthorInfo-content']//span//text()"
        ).extract()[0]
        replyItem["images"] = []
        # Concatenate the answer's rich-text paragraphs with <br>.
        text = []
        ptags = reply.xpath(
            ".//div[contains(@class, 'RichContent')]//span[contains(@class, 'RichText')]/p"
        )
        for p in ptags:
            text.append(p.xpath("string()").extract()[0])
        replyItem["content"] = '<br>'.join(text)
        # Tooltip text looks like "发布于 2020-01-01"; keep the date part.
        _, addtime = reply.xpath(
            ".//div[@class='ContentItem-time']//span/@data-tooltip"
        ).extract()[0].split("发布于 ")
        replyItem["addtime"] = addtime
        replyItem["source"] = response.meta["origin_url"]
        replyItem["headPortrait"] = reply.xpath(
            ".//span[@class='UserLink AuthorInfo-avatarWrapper']//img/@src"
        ).extract()[0]
        # aria-label looks like "赞同 123"; keep the number.
        _, likes = reply.xpath(
            ".//div[contains(@class, 'ContentItem-actions')]/span/button[contains(@class, 'VoteButton--up')]/@aria-label"
        ).extract()[0].split("赞同 ")
        replyItem["likes"] = likes
        wenda["askList"].append(replyItem)

    huatiContent["content"] = wenda
    huatiContent["visits"] = visits
    yield huatiContent
def parse_wenda(self, response):
    """Parse a Q&A detail page (detail-tit / replay-section layout) into a
    WendaAskItem; each answer block becomes a WendaReplayItem.
    """
    ask = WendaAskItem()
    ask["tagName"] = self.keyword
    ask["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    ask["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    ask["title"] = response.xpath(
        "//span[@class='detail-tit']/text()").extract()[0]
    ask["images"] = []
    ask["content"] = ""
    ask["addtime"] = ""
    ask["source"] = response.meta["origin_url"]
    ask["username"] = ""
    ask["headPortrait"] = ""
    ask["askList"] = []
    ask["topicUrl"] = ""

    replys = response.xpath("//div[@class='replay-section answer_item']")
    # Fix: leftover debug print() replaced by the spider logger; the
    # redundant ``len(replys) > 0`` guard is dropped (an empty selector
    # list simply iterates zero times).
    self.logger.debug("found %d replies", len(replys))
    for reply in replys:
        replyItem = WendaReplayItem()
        replyItem["title"] = ask["title"]
        # Registered users render as <a class='user-name'>; others as <span>.
        if reply.xpath(".//a[@class='user-name']"):
            replyItem["username"] = reply.xpath(
                ".//a[@class='user-name']/text()").extract()[0]
        else:
            replyItem["username"] = reply.xpath(
                ".//span[@class='user-name']/text()").extract()[0]
        replyItem["images"] = []
        replyItem["content"] = reply.xpath(
            "string(.//pre[contains(@class, 'answer_con')])"
        ).extract()[0].strip()
        # The user-txt line looks like "<time> 回答…"; keep the time part.
        replyItem["addtime"] = reply.xpath(
            ".//div[@class='user-txt']/text()").extract()[0].split(" 回答")[0]
        replyItem["source"] = response.meta["origin_url"]
        if reply.xpath(".//a[@class='user-thumb']"):
            replyItem["headPortrait"] = reply.xpath(
                ".//a[@class='user-thumb']/img/@src").extract()[0]
        else:
            replyItem["headPortrait"] = reply.xpath(
                ".//div[@class='user-thumb-box']//img/@src").extract()[0]
        if reply.xpath(".//div[@class='ft-btn-box']/a"):
            replyItem["likes"] = reply.xpath(
                ".//div[@class='ft-btn-box']/a[1]/@data-num").extract()[0]
        else:
            replyItem["likes"] = 0
        ask["askList"].append(replyItem)
    yield ask
def parse_qa(self, response):
    """Parse an audio-intro style Q&A page into a WendaAskItem.

    The main answer comes from the second audio-intro-main block; further
    answers come from the ask-mod-item blocks.
    """
    wendaAskItem = WendaAskItem()
    wendaAskItem["tagName"] = self.keyword
    if response.xpath("//meta[@name='keywords']"):
        wendaAskItem["keyword"] = response.xpath(
            "//meta[@name='keywords']/@content").extract()[0]
    else:
        wendaAskItem["keyword"] = ""
    wendaAskItem["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    wendaAskItem["title"] = response.xpath(
        "//h1[@class='audio-intro-h1']/text()").extract()[0]
    wendaAskItem["images"] = []
    # Block [1] is the question text, block [2] is the main answer.
    wendaAskItem["content"] = response.xpath(
        "//div[@class='areat-m']/div[contains(@class, 'audio-intro-main')][1]/p/text()"
    ).extract()[0].strip()
    wendaAskItem["addtime"] = response.xpath(
        "//div[contains(@class, 'intro-ts')]/span[contains(@class, 'date')]/text()"
    ).extract()[0]
    wendaAskItem["source"] = response.meta["origin_url"]
    wendaAskItem["username"] = ""
    wendaAskItem["headPortrait"] = ""
    wendaAskItem["askList"] = []
    wendaAskItem["topicUrl"] = ""

    # Main answer from the page's featured responder.
    replyItem = WendaReplayItem()
    replyItem["title"] = wendaAskItem["title"]
    replyItem["username"] = response.xpath(
        "//div[contains(@class, 'audio-intro-l')]/a[@class='a']/span/text()"
    ).extract()[0]
    replyItem["images"] = []
    replyItem["content"] = response.xpath(
        "//div[@class='areat-m']/div[contains(@class, 'audio-intro-main')][2]/p/text()"
    ).extract()[0].strip()
    replyItem["addtime"] = wendaAskItem["addtime"]
    replyItem["source"] = response.meta["origin_url"]
    replyItem["headPortrait"] = response.xpath(
        "//div[contains(@class, 'audio-intro-l')]/a[@class='a']/img/@src"
    ).extract()[0]
    replyItem["likes"] = 0
    wendaAskItem["askList"].append(replyItem)

    # Additional answers.
    otherReplys = response.xpath("//div[@class='ask-mod-item']")
    for otherReply in otherReplys:
        reply = WendaReplayItem()
        reply["title"] = wendaAskItem["title"]
        reply["username"] = otherReply.xpath(
            ".//div[@class='part-left']/a[@class='a']/@title").extract()[0]
        reply["images"] = []
        reply["content"] = otherReply.xpath(
            ".//div[contains(@class, 'audio-intro-main')]/p/text()"
        ).extract()[0].strip()
        reply["addtime"] = wendaAskItem["addtime"]
        reply["source"] = response.meta["origin_url"]
        # Fix: the avatar was read via response.xpath(), which resolved the
        # relative path against the whole document and returned the first
        # avatar on the page for every reply; scope it to this reply node
        # (matching how username/content are extracted above).
        reply["headPortrait"] = otherReply.xpath(
            ".//div[@class='part-left']/a[@class='a']/img/@src").extract()[0]
        reply["likes"] = 0
        wendaAskItem["askList"].append(reply)
    yield wendaAskItem
def parse_ask(self, response):
    """Parse a medical-Q&A ask page (b_askcont / b_answerbox layout) into a
    WendaAskItem with one WendaReplayItem per answer.
    """
    ask_item = WendaAskItem()
    ask_item["tagName"] = self.keyword
    ask_item["keyword"] = response.xpath(
        "//meta[@name='keywords']/@content").extract()[0]
    ask_item["description"] = response.xpath(
        "//meta[@name='description']/@content").extract()[0]
    ask_item["title"] = response.xpath(
        "//h1[@id='d_askH1']/text()").extract()[0]
    ask_item["images"] = []
    ask_item["content"] = response.xpath(
        "string(//div[@class='b_askcont']/p[@class='crazy_new'])"
    ).extract()[0].strip()
    ask_item["addtime"] = response.xpath(
        "//div[@class='b_askab1']//span[2]/text()").extract()[0]
    ask_item["source"] = response.meta["origin_url"]
    # Askers may be anonymized; fall back to the site's placeholder.
    author_var = response.xpath("//var[@class='ask_Author']")
    if author_var:
        ask_item["username"] = author_var.xpath("./text()").extract()[0]
    else:
        ask_item["username"] = "******"
    ask_item["headPortrait"] = ""
    ask_item["askList"] = []
    ask_item["topicUrl"] = ""

    answers = response.xpath(
        "//div[contains(@class, 'b_answerbox')]/div[@class='b_answerli']")
    if answers:
        for answer in answers:
            reply = WendaReplayItem()
            reply["title"] = ask_item["title"]
            # Doctor names render as <a>, anonymous ones as <var>.
            name_link = answer.xpath(
                ".//div[contains(@class, 'b_answertop')]//span[@class='b_sp1']/a")
            if name_link:
                reply["username"] = answer.xpath(
                    ".//div[contains(@class, 'b_answertop')]//span[@class='b_sp1']/a/text()"
                ).extract()[0]
            else:
                reply["username"] = answer.xpath(
                    ".//div[contains(@class, 'b_answertop')]//span[@class='b_sp1']/var/text()"
                ).extract()[0]
            reply["images"] = []
            body_p = answer.xpath(
                ".//div[contains(@class, 'b_answercont')]//div[@class='crazy_new']/p")
            if body_p:
                reply["content"] = answer.xpath(
                    ".//div[contains(@class, 'b_answercont')]//div[@class='crazy_new']/p/text()"
                ).extract()[0].strip()
            else:
                reply["content"] = ""
            time_span = answer.xpath(
                ".//div[contains(@class, 'b_answercont')]//span[@class='b_anscont_time']")
            if time_span:
                reply["addtime"] = answer.xpath(
                    ".//div[contains(@class, 'b_answercont')]//span[@class='b_anscont_time']/text()"
                ).extract()[0].strip()
            else:
                reply["addtime"] = ""
            reply["source"] = ask_item["source"]
            # The avatar wrapper may be an <a> or a <var> element.
            face_link = answer.xpath(
                "./div[contains(@class, 'b_answertop')]/a[@class='b_docface']")
            if face_link:
                reply["headPortrait"] = answer.xpath(
                    "./div[contains(@class, 'b_answertop')]/a[@class='b_docface']/img/@src"
                ).extract()[0]
            else:
                reply["headPortrait"] = answer.xpath(
                    "./div[contains(@class, 'b_answertop')]/var[@class='b_docface']/img/@src"
                ).extract()[0]
            # The second b_sp2 span holds text like "有用(12)"; pull the number.
            info = answer.xpath(
                "string(./div[contains(@class, 'b_answertop')]/div[@class='b_answertl']/span[@class='b_sp2'][2])"
            ).extract()[0]
            likes = 0
            if info:
                match = re.search(r"\d+\.?\d*", info)
                if match and match.group():
                    likes = match.group()
            reply["likes"] = likes
            ask_item["askList"].append(reply)
    yield ask_item