예제 #1
0
 def parse_content(self, response):
     """Assemble an ``LcItem`` from a ``detail_cnt`` page and yield it if new.

     Content, title and creation time are always read from the page, but
     the item is only completed and yielded when the second ``span`` of
     the ``para`` block contains text; otherwise nothing is yielded.

     Args:
         response: Object exposing ``url`` and ``xpath()`` (presumably a
             Scrapy response -- confirm against the caller).

     Yields:
         The populated item when its ``gid`` is not in ``self.data_zxs``.
     """
     # Every query below is rooted at the same container element.
     detail = '//div[@class="detail_cnt"]'
     para = detail + '//div[@class="para"]'

     item = LcItem()
     body = hide_and_sub(response, detail, "</p>")
     heading = response.xpath(detail + '/h4/text()').extract_first()
     item["content"] = body
     item["title"] = heading
     item["create_time"] = response.xpath(
         para + '/span[1]/text()').extract_first()
     print(response.url, "==============")

     # Pages whose second span has no text are skipped entirely.
     author_nodes = response.xpath(para + '/span[2]/text()')
     if not author_nodes:
         return

     item["author"] = author_nodes.extract_first().replace("作者:", "")
     item["source"] = "壹生"
     item["gid"] = parse_title(heading, item["source"])
     labels = response.xpath(
         detail + '//div[@class="label"]//span//text()').extract()
     item["key_word"] = ",".join(labels)
     item["wxname"] = "壹生资讯"

     # Items whose gid is already recorded are dropped.
     if item["gid"] in self.data_zxs:
         return
     yield item
예제 #2
0
 def parse_content(self, response):
     """Build an ``LcItem`` from an ``article-header`` page and yield it if new.

     The header text is expected to carry the source (``来源:.../``), an
     optional author (``/ 作者:.../``) and a ``Y-M-D`` date; these are
     pulled out with regular expressions.

     Args:
         response: Object exposing ``xpath()`` and a ``meta`` mapping with
             the keys ``"kw"`` and ``"wn"`` (presumably a Scrapy response
             -- confirm against the caller).

     Yields:
         The populated item when its ``gid`` is not in ``self.ids`` and
         the year part of the date equals ``"2018"``.

     Raises:
         TypeError: If the header text is ``None`` (passed to ``re``).
         IndexError: If the source or the date pattern finds no match.
     """
     item = LcItem()
     header = response.xpath(
         '//div[@class="article-header"]/div[1]/div[1]/text()'
     ).extract_first()
     title = response.xpath(
         '//div[@class="article-header"]/h1/text()').extract_first()

     # Patterns are raw strings so that ``\d`` is a regex escape rather than
     # an invalid string escape (a SyntaxWarning on recent Python versions).
     source = re.findall(r'来源:(.*?)/', header)[0].strip()
     year, month, day = re.findall(r'(\d+)-(\d+)-(\d+)', header)[0]
     date = "-".join((year, month, day))
     authors = re.findall(r'/ 作者:(.*?)/', header)
     author = authors[0].strip() if authors else ""

     item["content"] = hide_and_sub(response,
                                    '//div[@class="article-content"]',
                                    "</p>")
     item["title"] = title
     item["source"] = source
     item["gid"] = parse_title(title, source)
     item["author"] = author
     item["create_time"] = date
     item["key_word"] = response.meta["kw"]
     item["wxname"] = response.meta["wn"]

     # NOTE(review): the accepted year is hard-coded; confirm it is still
     # the intended filter.
     if item["gid"] not in self.ids and year == "2018":
         yield item
예제 #3
0
 def parse_video(self, response):
     """Build an ``LcItem`` for a video page and yield it if it is new.

     The item content is the processed ``tabs_cnt course`` block followed
     by a newline and the player link taken from the ``jumpPlay`` anchor.
     Title, creation time, author and keywords come from ``response.meta``
     (keys ``"t"``, ``"e"``, ``"a"``, ``"k"``).

     Args:
         response: Object exposing ``xpath()`` and ``meta`` (presumably a
             Scrapy response -- confirm against the caller).

     Yields:
         The populated item when its ``gid`` is not in ``self.data_sps``.
     """
     item = LcItem()

     # NOTE(review): the base URL literal starts with a space; it is kept
     # exactly as written -- confirm whether that is intentional.
     play_href = response.xpath('//a[@id="jumpPlay"]/@href').extract_first()
     video_link = " https://www.cmtopdr.com" + play_href

     course_html = hide_and_sub(
         response, '//div[@class="tabs_cnt course"]', "</p>")
     fixed_html = course_html.replace('==/">', '==">')
     item["content"] = fixed_html + "\n" + video_link

     meta = response.meta
     item["title"] = meta["t"]
     item["create_time"] = meta["e"]
     item["author"] = meta["a"]
     item["source"] = "壹生视频教学"
     item["gid"] = parse_title(meta["t"], item["source"])
     item["key_word"] = meta["k"]
     item["wxname"] = "壹生视频"

     # Items whose gid is already recorded are dropped.
     if item["gid"] in self.data_sps:
         return
     yield item
예제 #4
0
 def content_parse(self, response):
     """Turn a medical-case JSON response into an ``LcItem`` and yield it.

     The response URL is first appended to ``./ykd_bl.txt``. The response
     text then has two specific code points and all whitespace removed
     before it is parsed as JSON. When ``errorCode`` is ``"0"``, the case
     abstract is split at each ``imgsrc`` marker, the original image
     references are replaced with the values returned by ``du``, and the
     result is stored together with the case's ``comment``.

     Any failure while building the item is logged and the response is
     skipped, so one malformed case does not abort the callback.

     Args:
         response: Object exposing ``url``, ``text`` and ``meta["aut"]``
             (presumably a Scrapy response -- confirm against the caller).

     Yields:
         The populated item when its ``gid`` is not in ``self.old_url``.
     """
     import logging  # local import keeps this block self-contained

     with open("./ykd_bl.txt", "a") as fw:
         fw.write(response.url + "\n")
     aut = response.meta["aut"]
     item = LcItem()
     rsps = response.text.replace(u"\u44ec", u"").replace(u"\u301c", u"")
     # All whitespace is dropped before parsing, which is why the patterns
     # below look for "imgsrc" and '"alt' without separating spaces.
     datas = json.loads(re.sub(r"\s+", "", rsps))
     img_re = re.compile('<imgsrc="(.*?)"')
     if datas["errorCode"] != "0":
         return
     try:
         case = datas["obj"]["medicalCase"]
         # json.dumps escapes the embedded quotes as \" -- the pattern
         # below relies on that to strip each original image reference.
         pieces = json.dumps(case["abstractAbbr"],
                             ensure_ascii=False).split('imgsrc')
         quoted_src = r'\\"(.*?)"alt'
         img_urls = img_re.findall(case["abstractAbbr"])
         cleaned = [re.sub(quoted_src, "", piece) for piece in pieces]
         # presumably du() returns the rewritten image URL -- TODO confirm
         new_urls = [du(u, case["id"]) for u in img_urls if u]
         # Pad with blanks so zip() below keeps every text piece.
         new_urls.extend([" "] * (len(cleaned) - len(new_urls)))
         merged = "".join(
             str(text) + str(url) for text, url in zip(cleaned, new_urls))
         # The backslash written after "src=" is a placeholder: the final
         # replace() of this chain turns it into a space, and the next
         # statement turns 'src= h' into 'src="h'.
         merged = merged.replace(
             "<http:", "\n<img src=\\http:").replace("==", '').replace(
                 '"图', 'alt="图').replace("\\", ' ')
         merged = merged.replace('src= h', 'src="h').replace(
             ' alt', '" alt')
         item["content"] = (merged + "\n作者讨论\n" +
                            case["comment"]).replace("\n", "<br/>")
         item["create_time"] = case["pubDate"]
         item["title"] = case["medicalName"]
         item["author"] = aut
         item["key_word"] = ""
         item["source"] = "医口袋病例"
         item["gid"] = parse_title(item["title"], item["source"])
         if item["gid"] not in self.old_url:
             yield item
     except Exception:
         # Still best-effort, but no longer silent, and GeneratorExit /
         # KeyboardInterrupt are no longer swallowed.
         logging.getLogger(__name__).exception(
             "content_parse: could not build item for %s", response.url)
예제 #5
0
 def parse_media(self, response):
     """Build an ``LcItem`` from a media article page and yield it if new.

     Nothing is yielded when ``response.text`` is empty. Otherwise the
     header text is split on ``"/"`` and its first three parts become the
     source, author and creation time. The article and multimedia blocks
     are joined, their ``/storage/media/`` image paths are made absolute,
     and the result is passed through ``hide_and_sub``.

     Args:
         response: Object exposing ``text``, ``xpath()`` and
             ``meta["wn"]`` (presumably a Scrapy response -- confirm
             against the caller).

     Yields:
         The populated item when its ``gid`` is not in ``self.ids``.
     """
     item = LcItem()
     if not response.text:
         return

     header_text = response.xpath(
         '//div[@class="article-header"]/div[1]/div[1]/text()'
     ).extract_first()
     heading = response.xpath(
         '//div[@class="article-header"]/h1/text()').extract_first()

     # The header is "/"-separated; fields are filled in positional order.
     header_parts = header_text.split("/")
     for field, position in (("source", 0), ("author", 1),
                             ("create_time", 2)):
         item[field] = header_parts[position]
     item["title"] = heading

     fragments = response.xpath(
         '//div[@class="article-content"]|//div[@class="multimedia"]'
     ).extract()
     # Site-relative media paths are rewritten to absolute URLs.
     absolute_html = "".join(fragments).replace(
         'src="/storage/media/',
         'src="http://www.bio360.net/storage/media/')
     item["content"] = hide_and_sub(absolute_html, "*", "</p>")
     item["key_word"] = ""
     item["wxname"] = response.meta["wn"]
     item["gid"] = parse_title(heading, item["source"])

     # Items whose gid is already recorded are dropped.
     if item["gid"] in self.ids:
         return
     yield item