Exemplo n.º 1
0
 def get_post(self, item):
     """Download one article page and store its details on ``item``.

     Links whose extension is ``pdf`` (in any letter case) are skipped
     without downloading. On download failure or when the publication
     date node cannot be read, the item is left untouched.

     Args:
         item: object exposing ``get_info(key)`` and ``set_info(dict)``;
             its ``sourceUrl`` and ``title`` entries are read here.

     Returns:
         None. Results are written through ``item.set_info``.
     """
     url = item.get_info("sourceUrl")
     # Compare the extension case-insensitively so ".PDF" is skipped too.
     if url.rsplit(".", 1)[-1].lower() == "pdf":
         return
     xml = Download(url).request()
     if xml is False:
         return
     try:
         date_node = xml.xpath(
             '//div[@class="xxxq_text_tit"][1]/h6/span[2]')[0]
         date = date_node.text.replace("发布日期:", "")
     except Exception:
         # Best-effort scraping: a missing node or empty text means the
         # page layout differs, so report it and give up on this item.
         print_info("{} 解析失败".format(url))
         return
     # Only the direct text of each <p> is collected; empty ones are dropped.
     body = [
         p.text for p in xml.xpath('//div[@class="TRS_Editor"]/p') if p.text
     ]
     item.set_info({
         "date": date,
         "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
         "source": "深圳市卫生健康委员会",
         "body": "\n".join(body),
         "effective": True
     })
Exemplo n.º 2
0
 def get_post(self, item):
     """Download one article page and store its details on ``item``.

     The header paragraph is split on single spaces: the first two tokens
     form the date (with the "时间:" label removed) and the fourth token
     is the source (with the "来源:" label removed). If the header is
     missing, has no text, or has fewer tokens than expected, a parse
     failure is reported and the item is left untouched.

     Args:
         item: object exposing ``get_info(key)`` and ``set_info(dict)``;
             its ``sourceUrl`` and ``title`` entries are read here.

     Returns:
         None. Results are written through ``item.set_info``.
     """
     url = item.get_info("sourceUrl")
     xml = Download(url).request()
     if xml is False:
         return
     try:
         header = xml.xpath(
             '//p[@class="margin_top15 c999999 text_cencer"]')[0].text
         # Splitting and indexing stay inside the try so that a header
         # with no text or too few tokens is reported, not raised.
         fields = header.split(" ")
         date = "{} {}".format(fields[0].replace("时间:", ""), fields[1])
         source = fields[3].replace("来源:", "")
     except Exception:
         print_info("{} 解析失败".format(url))
         return
     # Only the direct text of each <p> is collected; empty ones are dropped.
     body = [
         p.text for p in xml.xpath('//div[@class="content-content"]/p')
         if p.text
     ]
     item.set_info({
         "date": date,
         "_id": generate_hash("{}{}".format(item.get_info("title"), date)),
         "source": source,
         "body": "\n".join(body),
         "effective": True
     })
Exemplo n.º 3
0
 def get_post_list(self, url, items):
     """Append one ``GDWJWItem`` per new entry of a listing page to ``items``.

     Each ``<li>`` of the first ``div.section.list`` block is expected to
     hold an ``<a>`` (title and link) and a ``<span>`` (date). Entries
     missing either child are skipped, as are entries for which
     ``self.url_repeat`` does not return ``False``.

     Args:
         url: address of the listing page to download.
         items: list that receives the newly created items (mutated
             in place).

     Returns:
         None.
     """
     xml = Download(url).request()
     if xml is False:
         return
     for li in xml.xpath('//div[@class="section list"][1]/ul/li'):
         a = li.find("a")
         span = li.find("span")
         # find() yields None for a missing child; skip such rows instead
         # of letting one malformed entry abort the whole page.
         if a is None or span is None:
             continue
         href = a.get("href")
         if self.url_repeat(href) is not False:
             continue
         title = a.get("title")
         date = span.text
         item = GDWJWItem()
         item.set_info({
             "title": title,
             "sourceUrl": href,
             "_id": generate_hash("{}{}".format(title, date)),
             "agency": "广东省卫健委",
             "date": date,
             "effective": True
         })
         items.append(item)
Exemplo n.º 4
0
 def get_page_num(self):
     """Return the page count read from the pager script on the start page.

     The first argument of the inline ``createPageHTML(...)`` call is
     taken as the number of pages. Falls back to 1 when the download
     fails, the pager script is absent, or its first argument is not an
     integer.

     Returns:
         int: the parsed page count, or 1 as a fallback.
     """
     xml = Download(self._start_url).request()
     if xml is False:
         return 1
     scripts = xml.xpath('//div[@class="zx_ml_list_page"]/script/text()')
     if not scripts:
         # No pager script on the page: treat it as a single page.
         return 1
     args = scripts[0].replace("createPageHTML(", "").replace(");", "")
     try:
         return int(args.split(",")[0])
     except ValueError:
         # Unexpected script content; same fallback as a failed download.
         return 1
Exemplo n.º 5
0
 def get_page_num(self):
     """Return the page number embedded in the "last" pager link.

     Looks at the ``href`` of the first ``<a class="last">`` element and
     extracts ``N`` from ``index_N.html``. Falls back to 1 when the
     download fails, the link or its ``href`` is absent, or the ``href``
     does not contain that pattern.

     Returns:
         int: the parsed number, or 1 as a fallback.
     """
     xml = Download(self._start_url).request()
     if xml is False:
         return 1
     last_links = xml.xpath('//a[@class="last"]')
     if not last_links:
         # No "last" link on the page: treat it as a single page.
         return 1
     hrefs = last_links[0].xpath("@href")
     if not hrefs:
         return 1
     # Require at least one digit and a literal dot, so int() never sees
     # an empty or non-numeric capture.
     match = re.search(r"index_(\d+)\.html", hrefs[0])
     if match is None:
         return 1
     return int(match.group(1))
Exemplo n.º 6
0
    def get_post(self, item):
        """Download one article page and store its body text on ``item``.

        When the ``check_content_points`` list has several ``<li>``
        entries, the text following each entry's ``<span>`` is collected.
        Otherwise the direct text of the single ``<li>`` is used. Empty
        pieces are dropped. On download failure or a parse error the item
        is left untouched.

        Args:
            item: object exposing ``get_info(key)`` and ``set_info(dict)``;
                its ``sourceUrl`` entry is read here.

        Returns:
            None. The result is written through ``item.set_info``.
        """
        url = item.get_info("sourceUrl")
        xml = Download(url).request()
        if xml is False:
            return
        bodys = []
        try:
            lis = xml.xpath('//div[@class="check_content_points"]/ul/li')
            if len(lis) > 1:
                for li in lis:
                    tail = li.find("span").tail
                    if tail:
                        bodys.append(tail)
            else:
                # .text is None when the <li> has no leading text; joining
                # None would raise TypeError, so keep only real text.
                text = lis[0].text
                if text:
                    bodys.append(text)
        except Exception:
            print_info("解析错误:{}".format(url))
            return

        item.set_info({"body": "\n".join(bodys)})
Exemplo n.º 7
0
 def get_post_list(self, url, items):
     """Append one ``SZWJWItem`` per new entry of a listing page to ``items``.

     Each ``<li>`` under the first ``div.wendangListC`` block is expected
     to hold a ``<strong>`` (date) and an ``<a>`` (title and link). A
     leading "." in the link is replaced by the site prefix. Entries
     missing either child or the ``href`` are skipped, as are entries for
     which ``self.url_repeat`` does not return ``False``.

     Args:
         url: address of the listing page to download.
         items: list that receives the newly created items (mutated
             in place).

     Returns:
         None.
     """
     xml = Download(url).request()
     if xml is False:
         return
     for li in xml.xpath('//div[@class="wendangListC"][1]//li'):
         strong = li.find("strong")
         a = li.find("a")
         # find() yields None for a missing child; skip such rows instead
         # of letting one malformed entry abort the whole page.
         if strong is None or a is None:
             continue
         href = a.get("href")
         if href is None:
             continue
         date = strong.text
         # Raw string: "\." in a plain literal is an invalid escape.
         post_url = re.sub(r"^\.", "http://wjw.sz.gov.cn/yqxx", href)
         if self.url_repeat(post_url) is not False:
             continue
         item = SZWJWItem()
         item.set_info({
             "title": a.text,
             "sourceUrl": post_url,
             "_id": generate_hash("{}{}".format(a.text, date)),
             "agency": "深圳卫健委",
             "date": date,
             "effective": True,
             "source": "深圳市卫生健康委员会"
         })
         items.append(item)