def parse_content(self, response):
    """Parse a 壹生 (yisheng) article page and yield an ``LcItem``.

    Pulls title, body, publish time, optional author and keywords out of
    the ``detail_cnt`` container; items whose gid is already recorded in
    ``self.data_zxs`` are dropped.
    """
    item = LcItem()
    content = hide_and_sub(response, '//div[@class="detail_cnt"]', "</p>")
    title = response.xpath(
        '//div[@class="detail_cnt"]/h4/text()').extract_first()
    item["content"] = content
    item["title"] = title
    item["create_time"] = response.xpath(
        '//div[@class="detail_cnt"]//div[@class="para"]/span[1]/text()'
    ).extract_first()
    # The author span is optional.  Query the XPath once (the original
    # evaluated the same expression twice: once for the existence check
    # and once for the value).  A leftover debug print of the URL was
    # removed here.
    author = response.xpath(
        '//div[@class="detail_cnt"]//div[@class="para"]/span[2]/text()'
    ).extract_first()
    if author:
        item["author"] = author.replace("作者:", "")
    item["source"] = "壹生"
    item["gid"] = parse_title(title, item["source"])
    item["key_word"] = ",".join(
        response.xpath(
            '//div[@class="detail_cnt"]//div[@class="label"]//span//text()'
        ).extract())
    item["wxname"] = "壹生资讯"
    if item["gid"] not in self.data_zxs:
        yield item
def parse_content(self, response):
    """Parse an article whose header line carries source/date/author.

    The header text looks like ``来源:XXX / 2018-01-02 / 作者:YYY / …``;
    source, date and (optional) author are extracted from it by regex.
    Only previously-unseen articles dated 2018 are yielded.
    """
    item = LcItem()
    datas = response.xpath(
        '//div[@class="article-header"]/div[1]/div[1]/text()'
    ).extract_first()
    title = response.xpath(
        '//div[@class="article-header"]/h1/text()').extract_first()
    # Raw strings for all regex patterns: '\d' in a plain string literal
    # is an invalid escape sequence (DeprecationWarning, and a
    # SyntaxWarning on newer Pythons).  The pattern values are unchanged.
    sourse = re.compile(r'来源:(.*?)/').findall(datas)[0].strip()
    date = "-".join(re.compile(r'(\d+)-(\d+)-(\d+)').findall(datas)[0])
    author_hits = re.compile(r'/ 作者:(.*?)/').findall(datas)
    # Author is optional in the header; fall back to the empty string.
    author = author_hits[0].strip() if author_hits else ""
    item["content"] = hide_and_sub(response,
                                   '//div[@class="article-content"]',
                                   "</p>")
    item["title"] = title
    item["source"] = sourse
    item["gid"] = parse_title(title, sourse)
    item["author"] = author
    item["create_time"] = date
    item["key_word"] = response.meta["kw"]
    item["wxname"] = response.meta["wn"]
    # NOTE(review): the year filter is hard-coded to "2018" — presumably a
    # deliberate crawl window; confirm before widening.
    if item["gid"] not in self.ids and date.split("-")[0] == "2018":
        yield item
def parse_video(self, response):
    """Build an ``LcItem`` for a 壹生 video-lesson page.

    The absolute player URL is appended to the cleaned lesson body;
    title/date/author/keywords arrive via ``response.meta``.  The item
    is yielded only when its gid is not already in ``self.data_sps``.
    """
    meta = response.meta
    item = LcItem()
    # Absolute link to the embedded player (leading space reproduced
    # exactly as the original code emitted it).
    play_href = response.xpath('//a[@id="jumpPlay"]/@href').extract_first()
    video_link = " https://www.cmtopdr.com" + play_href
    body = hide_and_sub(response, '//div[@class="tabs_cnt course"]', "</p>")
    item["content"] = body.replace('==/">', '==">') + "\n" + video_link
    item["title"] = meta["t"]
    item["create_time"] = meta["e"]
    item["author"] = meta["a"]
    item["source"] = "壹生视频教学"
    item["gid"] = parse_title(meta["t"], "壹生视频教学")
    item["key_word"] = meta["k"]
    item["wxname"] = "壹生视频"
    if item["gid"] not in self.data_sps:
        yield item
def content_parse(self, response):
    """Parse a medical-case JSON response from 医口袋 and yield an ``LcItem``.

    All whitespace is stripped from the payload before JSON-decoding
    (which is why image tags below appear as ``<imgsrc=``).  The body is
    rebuilt by interleaving the text chunks of ``abstractAbbr`` with the
    rewritten image URLs, then normalised with a chain of literal
    replacements kept byte-identical to the original implementation.
    """
    # Best-effort audit log of every visited case URL.
    with open("./ykd_bl.txt", "a") as fw:
        fw.write(response.url + "\n")
    aut = response.meta["aut"]
    item = LcItem()
    # Drop two stray unicode characters, then collapse ALL whitespace so
    # the payload parses as compact JSON.
    rsps = response.text.replace(u"\u44ec", u"").replace(u"\u301c", u"")
    datas = json.loads(re.sub(r"\s+", "", rsps))
    img_re = re.compile('<imgsrc="(.*?)"')
    # Guard clause instead of nesting the whole body under the success
    # branch; behaviour is unchanged (non-"0" codes produced nothing).
    if datas["errorCode"] != "0":
        return
    try:
        data_list = datas["obj"]["medicalCase"]
        contents = json.dumps(data_list["abstractAbbr"],
                              ensure_ascii=False).split('imgsrc')
        partter = r'\\"(.*?)"alt'
        img_url = img_re.findall(data_list["abstractAbbr"])
        contents_1 = [re.sub(partter, "", c) for c in contents]
        now_u = [du(u, data_list["id"]) for u in img_url if u]
        # Pad the URL list so zip() keeps every text chunk.
        cz = len(contents_1) - len(now_u)
        for c in range(cz):
            now_u.append(" ")
        n_content = [
            "".join([str(j) for j in i])
            for i in list(zip(contents_1, now_u))
        ]
        n_content = "".join(n_content).replace(
            "<http:", "\n<img src=\http:").replace(r"==", '').replace(
            '"图', 'alt="图').replace("\\", ' ')
        n_content = n_content.replace('src= h', 'src="h').replace(
            ' alt', '" alt')
        item["content"] = (n_content + "\n作者讨论\n" +
                           data_list["comment"]).replace("\n", "<br/>")
        item["create_time"] = data_list["pubDate"]
        item["title"] = data_list["medicalName"]
        item["author"] = aut
        item["key_word"] = ""
        item["source"] = "医口袋病例"
        item["gid"] = parse_title(item["title"], item["source"])
        if item["gid"] not in self.old_url:
            yield item
    except Exception:
        # Malformed or unexpected payloads are skipped best-effort, as
        # before — but a bare ``except:`` also swallowed
        # KeyboardInterrupt/SystemExit; ``except Exception`` does not.
        pass
def parse_media(self, response):
    """Parse a bio360 media article and yield an ``LcItem`` if unseen.

    The header div holds ``source/author/date`` separated by slashes;
    relative ``/storage/media/`` image paths in the body are rewritten
    to absolute bio360 URLs before cleaning.
    """
    # Nothing to do for an empty response body.
    if not response.text:
        return
    item = LcItem()
    header = response.xpath(
        '//div[@class="article-header"]/div[1]/div[1]/text()'
    ).extract_first()
    headline = response.xpath(
        '//div[@class="article-header"]/h1/text()').extract_first()
    # Split the header once and index it (the original re-split it for
    # each field); indexing preserves the original IndexError behaviour
    # on short headers.
    parts = header.split("/")
    item["source"] = parts[0]
    item["author"] = parts[1]
    item["create_time"] = parts[2]
    item["title"] = headline
    fragments = response.xpath(
        '//div[@class="article-content"]|//div[@class="multimedia"]'
    ).extract()
    page_html = "".join(fragments).replace(
        'src="/storage/media/',
        'src="http://www.bio360.net/storage/media/')
    item["content"] = hide_and_sub(page_html, "*", "</p>")
    item["key_word"] = ""
    item["wxname"] = response.meta["wn"]
    item["gid"] = parse_title(headline, parts[0])
    if item["gid"] not in self.ids:
        yield item