Python extract_tag_attribute示例，html.extract_tag_attribute Python示例

示例#1

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def extract_content(self, param, clean=None, before=None, after=None):
     """Build the content list from the ``div#eData > dl`` rows of the page.

     For every row, the value extracted from the first ``dd`` is treated as
     an image URL and the value extracted from the fifth ``dd`` as its
     caption.  Rows without a usable ``http...jpg`` URL are skipped
     entirely; a row with a URL but no caption contributes only the image.

     ``param``, ``clean``, ``before`` and ``after`` are accepted but not
     used by this implementation.

     :return: the result of ``self.clean_content`` applied to the list of
         ``{"tag": "img", "src": ...}`` / ``{"tag": "p", "text": ...}``
         entries.
     """
     row_query = {"params": {"selector": "div#eData > dl"}, "method": "select"}
     image_query = {
         "params": {"selector": "dd:nth-of-type(1)"},
         "method": "select",
     }
     caption_query = {
         "params": {"selector": "dd:nth-of-type(5)"},
         "method": "select",
     }
     rows = find_tags(self._soup, row_query)
     blocks = []
     for row in rows:
         image_cell = find_tag(row, image_query)
         if not image_cell:
             continue
         link = extract_tag_attribute(image_cell, "text")
         # Only absolute links to .jpg files are accepted as images.
         if not link or not link.startswith("http") or not link.endswith(".jpg"):
             continue
         blocks.append({"tag": "img", "src": link})
         caption_cell = find_tag(row, caption_query)
         caption = None
         if caption_cell:
             caption = extract_tag_attribute(caption_cell, "text")
         if caption:
             # Literal "<br />" markers in the caption become spaces.
             caption = caption.replace("<br />", " ").strip()
             blocks.append({"tag": "p", "text": caption})
     return self.clean_content(blocks)

示例#2

0

显示文件

 def get_ol_tag(tag):
     """Return the truthy ``type`` and ``reversed`` attributes of *tag*.

     :param tag: tag handed to ``extract_tag_attribute``.
     :return: dict holding only those of ``"type"`` / ``"reversed"`` whose
         extracted value is truthy; empty when neither is.
     """
     candidates = (
         ("type", extract_tag_attribute(tag, "type")),
         ("reversed", extract_tag_attribute(tag, "reversed")),
     )
     return {key: value for key, value in candidates if value}

示例#3

0

显示文件

 def get_video_tag(self, tag):
     """Return the truthy ``src``, ``controls`` and ``poster`` attributes.

     :param tag: tag handed to ``extract_tag_attribute``.
     :return: dict containing only the attributes whose extracted value is
         truthy, in the order ``src``, ``controls``, ``poster``.
     """
     attributes = {}
     for name in ("src", "controls", "poster"):
         value = extract_tag_attribute(tag, name)
         if value:
             attributes[name] = value
     return attributes

示例#4

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def extract_content(self, param, clean=None, before=None, after=None):
     """Build the content list from the intro paragraph and image roll.

     The value extracted for ``p.explain`` (if any) becomes a leading
     ``p`` entry; every image under the ``div.roll`` list that has a
     ``src`` becomes an ``img`` entry, with each ``"stn"`` in the URL
     replaced by ``"n"``.

     ``param``, ``clean``, ``before`` and ``after`` are accepted but not
     used by this implementation.

     :return: list of ``{"tag": "p", "text": ...}`` /
         ``{"tag": "img", "src": ...}`` entries.
     """
     intro_query = {
         "params": {"selector": "p.explain"},
         "method": "select",
         "attribute": "text",
     }
     gallery_query = {
         "params": {"selector": "div.roll > div > ul > li  img"},
         "method": "select",
     }
     blocks = []
     intro = self.find_tag_extract_attribute(self._soup, intro_query)
     if intro:
         blocks.append({"tag": "p", "text": intro})
     for image in find_tags(self._soup, gallery_query):
         link = extract_tag_attribute(image, "src")
         if not link:
             continue
         # Rewrite the URL while appending instead of in a second pass.
         blocks.append({"tag": "img", "src": link.replace("stn", "n")})
     return blocks

示例#5

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def clean_author(self, author):
     """Remove the author/time ``span`` text from *author*.

     The page is searched for
     ``div#news_template_03_AuthorAndTime > span``; whatever
     ``extract_tag_attribute`` yields for that tag is deleted from
     *author*.

     :param author: author string to clean.
     :return: *author* with every occurrence of the extracted text
         removed, or *author* unchanged when the tag is not found or
         yields no text.
     """
     param = {
         "method": "select",
         "params": {
             "selector": "div#news_template_03_AuthorAndTime > span"
         }
     }
     tag = find_tag(root=self.soup, param=param)
     # NOTE(review): other callers in this file treat a falsy result from
     # find_tag as "no match" -- guard here as well instead of passing it on.
     if not tag:
         return author
     text = extract_tag_attribute(root=tag)
     # str.replace raises TypeError for a None pattern; nothing to strip.
     if not text:
         return author
     return author.replace(text, "")

示例#6

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def get_gallery(self):
     """Collect the ``data-src`` value of every slide image wrapper.

     :return: list with one entry per ``div#tt-slide div.img-wrap`` tag,
         holding whatever ``extract_tag_attribute`` returns for
         ``data-src`` (entries are not filtered).
     """
     wrapper_query = {
         "method": "select",
         "params": {"selector": "div#tt-slide div.img-wrap"},
     }
     wrappers = find_tags(self.soup, param=wrapper_query)
     return [
         extract_tag_attribute(wrapper, name="data-src")
         for wrapper in wrappers
     ]

示例#7

0

显示文件

 def _extract_tag(root, param):
     """Locate a tag under *root* and return one attribute of it.

     *param* is a lookup description such as::

         {
             "method": "find_all",
             "params": {},
             "nth": 0,
             "attribute": "text",
         }

     :param root: search root passed to ``find_tag``.
     :param param: mapping passed to ``find_tag``; its ``"attribute"``
         entry names the attribute to extract, and ``"text"`` is used
         when that entry is missing or ``None``.
     :return: the value returned by ``extract_tag_attribute``.
     """
     found = find_tag(root, param)
     wanted = param.get("attribute")
     return extract_tag_attribute(found, "text" if wanted is None else wanted)

示例#8

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def extract_content(self, param, clean=None, before=None, after=None):
     """Build the content list from the lead figure and the body sections.

     The ``src`` of the lead ``figure`` image (if any) comes first.  Each
     ``div.body-copy > div`` section then contributes either its first
     image (when one with a ``src`` is found) or, otherwise, its text --
     provided the text is not blank once ``"&nbsp;"`` is removed.  Image
     URLs are resolved against ``self.url``.

     ``param``, ``clean``, ``before`` and ``after`` are accepted but not
     used by this implementation.

     :return: list of ``{"tag": "img", "src": ...}`` /
         ``{"tag": "p", "text": ...}`` entries.
     """
     blocks = []
     lead_query = {
         "params": {"selector": "div.region-inner > figure > img"},
         "method": "select",
         "attribute": "src",
     }
     lead_src = self.find_tag_extract_attribute(self._soup, lead_query)
     if lead_src:
         blocks.append({"tag": "img", "src": urljoin(self.url, lead_src)})
     section_query = {
         "params": {"selector": "div.body-copy > div"},
         "method": "select",
     }
     inline_image_query = {
         "params": {"selector": "img"},
         "method": "select",
         "attribute": "src",
     }
     for section in find_tags(self._soup, section_query):
         inline_src = self.find_tag_extract_attribute(section, inline_image_query)
         if inline_src:
             blocks.append({"tag": "img", "src": urljoin(self.url, inline_src)})
             continue
         paragraph = extract_tag_attribute(section, "text")
         # The blank check ignores "&nbsp;", but the stored text keeps it.
         if paragraph.replace("&nbsp;", "").strip():
             blocks.append({"tag": "p", "text": paragraph})
     return blocks

示例#9

0

显示文件

文件： extractor.py 项目： zhidu-qidian/Spiders

 def find_tag_extract_attribute(root, params):
     """Find a tag under *root* and extract the attribute named in *params*.

     :param root: search root passed to ``find_tag``.
     :param params: mapping passed to ``find_tag``; its ``"attribute"``
         key selects the attribute, defaulting to ``"text"`` when absent.
     :return: the extracted value, or ``None`` when ``find_tag`` returns
         a falsy result.
     """
     found = find_tag(root, params)
     if found:
         attribute_name = params.get("attribute", "text")
         return extract_tag_attribute(found, attribute_name)
     return None

示例#10

0

显示文件

 def get_object_tag(tag):
     """Return the ``data`` attribute of *tag* when it is truthy.

     :param tag: tag handed to ``extract_tag_attribute``.
     :return: ``{"data": value}`` when the extracted value is truthy,
         otherwise an empty dict.
     """
     value = extract_tag_attribute(tag, "data")
     return {"data": value} if value else {}