Пример #1
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     The item receives the page URL, a ``question`` dict (``askText`` from
     the ``ask_title`` heading, ``askDesc`` from the first paragraph of the
     following ``wd_cont_s`` block, or ``''`` when that lookup fails) and an
     ``answer`` list holding one cleaned entry per ``div[@class="angle"]``
     marker that has a following paragraph.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. If anything outside the two guarded lookups
         raises (for example the title is missing), the error is printed
         and logged via ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_html = response.xpath(
             '//div[@class="ask_title"]/h1').extract()[0]
         ask_text = self.filter_tags_blank(title_html)
         try:
             desc_html = response.xpath(
                 '//div[@class="ask_title"]/following-sibling::div[@class="wd_cont_s"][1]/p[1]'
             ).extract()[0]
             desc_text = self.filter_tags_blank(desc_html)
         except Exception:
             # The description is optional; fall back to an empty string.
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             collected = []
             for marker in response.xpath('//div[@class="angle"]'):
                 paragraphs = marker.xpath(
                     './/following-sibling::p[1]').extract()
                 if not paragraphs:
                     # Previously a marker without its paragraph raised
                     # IndexError and discarded every answer collected so
                     # far; skip just that marker instead.
                     continue
                 collected.append(self.filter_tags_blank(paragraphs[0]))
             item['answer'] = collected
         except Exception:
             item['answer'] = []
         yield item
     except Exception as e:
         print(e)
         logger.info("匹配信息出错。错误原因:")
         logger.info(e)
Пример #2
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned ``h1#d_askH1`` markup)
     and ``askDesc`` (cleaned first ``p.crazy_new`` match, or ``''`` when
     that lookup raises). ``answer`` is a list with one whitespace-free
     string per ``b_anscont_cont`` block, or ``''`` if building it raises.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         heading_html = response.xpath('//h1[@id="d_askH1"]').extract()[0]
         ask_text = self.filter_tags_blank(heading_html)
         try:
             desc_text = self.filter_tags_blank(
                 response.xpath('//p[@class="crazy_new"][1]').extract()[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             # Join each block's text nodes, then drop all whitespace.
             item['answer'] = [
                 "".join(
                     "".join(
                         block.xpath(
                             './/div[@class="crazy_new"]/p/text()').extract()
                     ).split()
                 )
                 for block in response.xpath(
                     '//div[@class="b_anscont_cont"]')
             ]
         except Exception:
             item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #3
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned first ``ul`` under
     ``iask_detail01a``) and ``askDesc`` (cleaned ``dl[2]/dd`` under
     ``iask_detail01b1``, or ``''`` when that lookup raises). ``answer`` is
     a list with one cleaned entry per ``iask_answer02a`` block that
     contains a ``<dd>``.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. If anything outside the two guarded lookups
         raises, the error is printed and logged via ``logger.info`` and
         nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         question_html = response.xpath(
             '//div[@class="iask_detail01a"]//ul').extract()[0]
         ask_text = self.filter_tags_blank(question_html)
         try:
             desc_html = response.xpath(
                 '//div[@class="iask_detail01b1"]/dl[2]/dd').extract()[0]
             desc_text = self.filter_tags_blank(desc_html)
         except Exception:
             # The description is optional; fall back to an empty string.
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             collected = []
             for block in response.xpath('//div[@class="iask_answer02a"]'):
                 dd_nodes = block.xpath('.//dd').extract()
                 if not dd_nodes:
                     # Previously one block without <dd> raised IndexError
                     # and replaced the whole answer list with ''; skip
                     # just that block instead.
                     continue
                 collected.append(self.filter_tags_blank(dd_nodes[0]))
             item['answer'] = collected
         except Exception:
             # NOTE(review): the fallback is '' while the success value is
             # a list; kept unchanged for existing consumers -- confirm
             # before normalising to [].
             item['answer'] = ''
         yield item
     except Exception as e:
         print(e)
         logger.info("匹配信息出错。错误原因:")
         logger.info(e)
Пример #4
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned first
     ``p.ask_article_title_p1``) and ``askDesc`` (cleaned first
     ``p.ask_article_nr1_p2``, or ``''`` when that lookup raises).
     ``answer`` is a list with one cleaned entry per ``answer_content2_1``
     block, or ``''`` if building it raises.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_matches = response.xpath(
             '//p[@class="ask_article_title_p1"]').extract()
         ask_text = self.filter_tags_blank(title_matches[0])
         try:
             desc_matches = response.xpath(
                 '//p[@class="ask_article_nr1_p2"]').extract()
             desc_text = self.filter_tags_blank(desc_matches[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             item['answer'] = [
                 self.filter_tags_blank(node.extract())
                 for node in response.xpath(
                     '//div[@class="answer_content2_1"]')
             ]
         except Exception:
             item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #5
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     Only the title is scraped: ``question['askText']`` is the cleaned
     first ``h1`` whose class contains ``fyahei``. ``question['askDesc']``
     and ``answer`` are always set to ``''``.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         heading_matches = response.xpath(
             '//h1[contains(@class, "fyahei")]').extract()
         ask_text = self.filter_tags_blank(heading_matches[0])
         item['question'] = {'askText': ask_text, 'askDesc': ''}
         item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #6
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned first ``h1``) and
     ``askDesc`` (cleaned first ``div.descip``, or ``''`` when that lookup
     raises). ``answer`` is a list with one cleaned entry per
     ``descip paint1`` block under ``dorawer``, or ``''`` if building it
     raises.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         heading_html = response.xpath('//h1').extract()[0]
         ask_text = self.filter_tags_blank(heading_html)
         try:
             desc_text = self.filter_tags_blank(
                 response.xpath('//div[@class="descip"]').extract()[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             answer_markup = response.xpath(
                 '//div[@class="dorawer"]/div[@class="descip paint1"]'
             ).extract()
             item['answer'] = [
                 self.filter_tags_blank(fragment)
                 for fragment in answer_markup
             ]
         except Exception:
             item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #7
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned
     ``h1[@class="four font-16 u_tit"]``) and ``askDesc`` (cleaned first
     ``p.k_questiond``, or ``''`` when that lookup raises). ``answer`` is a
     list with one cleaned entry per ``k_answerli`` block that contains a
     ``div.crazy_new``.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. If anything outside the two guarded lookups
         raises, the error is printed and logged via ``logger.info`` and
         nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         question_html = response.xpath(
             '//h1[@class="four font-16 u_tit"]').extract()[0]
         ask_text = self.filter_tags_blank(question_html)
         try:
             desc_html = response.xpath(
                 '//p[@class="k_questiond"]').extract()[0]
             desc_text = self.filter_tags_blank(desc_html)
         except Exception:
             # The description is optional; fall back to an empty string.
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             collected = []
             answer_blocks = response.xpath(
                 '//div[@class="k_answerlist"]/div[@class="k_answerli"]')
             for block in answer_blocks:
                 bodies = block.xpath(
                     './/div[@class="crazy_new"]').extract()
                 if not bodies:
                     # Previously one block without the answer body raised
                     # IndexError and replaced the whole list with ''; skip
                     # just that block instead.
                     continue
                 collected.append(self.filter_tags_blank(bodies[0]))
             item['answer'] = collected
         except Exception:
             # NOTE(review): the fallback is '' while the success value is
             # a list; kept unchanged for existing consumers -- confirm
             # before normalising to [].
             item['answer'] = ''
         yield item
     except Exception as e:
         print(e)
         logger.info("匹配信息出错。错误原因:")
         logger.info(e)
Пример #8
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned ``dt`` of
     ``dl.iask13_title``) and ``askDesc`` (cleaned ``ul.iask13_con`` of the
     ``iask13 iask13_q`` block, or ``''`` when that lookup raises).
     ``answer`` is a list with one string per ``iask13 iask13_a`` block,
     each being the concatenation of that block's cleaned ``ul.iask13_con``
     elements, or ``''`` if building it raises.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_matches = response.xpath(
             '//dl[@class="iask13_title"]/dt').extract()
         ask_text = self.filter_tags_blank(title_matches[0])
         try:
             desc_matches = response.xpath(
                 '//div[@class="iask13 iask13_q"]/ul[@class="iask13_con"]'
             ).extract()
             desc_text = self.filter_tags_blank(desc_matches[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             item['answer'] = [
                 ''.join(
                     self.filter_tags_blank(ul_node.extract())
                     for ul_node in block.xpath(
                         './/ul[@class="iask13_con"]')
                 )
                 for block in response.xpath(
                     '//div[@class="iask13 iask13_a"]')
             ]
         except Exception:
             item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #9
0
 def parse_detail_mongo(self, response):
     """Build a ``CorpusHealthItem`` from a question page and print it.

     ``question`` is a dict with ``askText`` (cleaned ``h1`` under
     ``div.why``) and ``askDesc`` (cleaned first ``p.pd_txt``, or ``''``
     when that lookup raises). ``answer`` is the cleaned first ``dl/dt``
     under ``div.an_cont``, or ``''`` when that lookup raises.

     NOTE(review): the item is only printed, never yielded, so this method
     returns ``None`` -- looks like leftover debugging; confirm.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.
     """
     print('==============')
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_matches = response.xpath('//div[@class="why"]/h1').extract()
         ask_text = self.filter_tags_blank(title_matches[0])
         try:
             desc_text = self.filter_tags_blank(
                 response.xpath('//p[@class="pd_txt"]').extract()[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             answer_matches = response.xpath(
                 '//div[@class="an_cont"]/dl/dt').extract()
             item['answer'] = self.filter_tags_blank(answer_matches[0])
         except Exception:
             item['answer'] = ''
         print(item)
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #10
0
 def parse_detail(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is the stripped second text node matched by
     ``//p[@class="crazy_new"]/text()``. ``answer`` is the text of the
     first ``b_anscont_cont`` block's ``crazy_new`` paragraphs, joined and
     with all whitespace removed.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any exception (including fewer than two
         question text nodes) is printed and logged via ``logger.info`` and
         nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         question_texts = response.xpath(
             '//p[@class="crazy_new"]/text()').extract()
         item['question'] = question_texts[1].strip()
         answer_fragments = response.xpath(
             '//div[@class="b_anscont_cont"][1]/div[@class="crazy_new"]/p/text()'
         ).extract()
         # Concatenate the fragments, then drop every whitespace run.
         item['answer'] = "".join("".join(answer_fragments).split())
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #11
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     Only the title is scraped: ``question['askText']`` is the cleaned
     first ``span.title``. ``question['askDesc']`` and ``answer`` are
     always set to ``''``.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_matches = response.xpath('//span[@class="title"]').extract()
         ask_text = self.filter_tags_blank(title_matches[0])
         item['question'] = {'askText': ask_text, 'askDesc': ''}
         item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #12
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (first ``h1`` markup with the
     literal ``<span>问</span>`` badge removed, then cleaned) and
     ``askDesc`` (cleaned first ``p`` under ``div.wenti_dec``, or ``''``
     when that lookup raises). ``answer`` is always ``''``.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         heading_html = response.xpath('//h1').extract()[0]
         heading_html = heading_html.replace('<span>问</span>', '')
         ask_text = self.filter_tags_blank(heading_html)
         try:
             desc_text = self.filter_tags_blank(
                 response.xpath(
                     '//div[@class="wenti_dec"]/p').extract()[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)
Пример #13
0
 def parse_detail_mongo(self, response):
     """Build one ``CorpusHealthItem`` from a question page and yield it.

     ``question`` is a dict with ``askText`` (cleaned first
     ``p[@class="fl dib fb"]``) and ``askDesc`` (cleaned ``div#qdetailc``,
     or ``''`` when that lookup raises). ``answer`` is the cleaned first
     answer ``div``, or ``''`` when that lookup raises.

     Args:
         response: Page response; only ``url`` and ``xpath()`` are used.

     Yields:
         The populated item. Any other exception is printed and logged via
         ``logger.info`` and nothing is yielded.
     """
     item = CorpusHealthItem()
     try:
         item['url'] = response.url
         title_matches = response.xpath('//p[@class="fl dib fb"]').extract()
         ask_text = self.filter_tags_blank(title_matches[0])
         try:
             desc_text = self.filter_tags_blank(
                 response.xpath('//div[@id="qdetailc"]').extract()[0])
         except Exception:
             desc_text = ''
         item['question'] = {'askText': ask_text, 'askDesc': desc_text}
         try:
             answer_matches = response.xpath(
                 '//div[@class="pt15 f14 graydeep  pl20 pr20"]').extract()
             item['answer'] = self.filter_tags_blank(answer_matches[0])
         except Exception:
             item['answer'] = ''
         yield item
     except Exception as err:
         print(err)
         logger.info("匹配信息出错。错误原因:")
         logger.info(err)