Exemplo n.º 1
0
 def extract_content(self, param, clean=None, before=None, after=None):
     """Collect image/caption entries from the ``div#eData > dl`` rows.

     For every row the text of the first ``dd`` is read as an image URL and
     kept only when it starts with ``http`` and ends with ``.jpg``. When the
     fifth ``dd`` exists and has non-empty text, that text (with literal
     ``<br />`` markers turned into spaces and surrounding whitespace
     stripped) is added as a paragraph right after the image.

     :param param: not used by this implementation.
     :param clean: not used by this implementation.
     :param before: not used by this implementation.
     :param after: not used by this implementation.
     :return: the result of ``self.clean_content`` applied to the list of
         ``{"tag": "img", ...}`` / ``{"tag": "p", ...}`` dicts.
     """
     def select_query(css):
         # Every lookup uses the same query shape; only the selector differs.
         return {"params": {"selector": css}, "method": "select"}

     image_query = select_query("dd:nth-of-type(1)")
     caption_query = select_query("dd:nth-of-type(5)")

     def entries_for(row):
         # Yield the image entry and, when available, its caption entry.
         picture = find_tag(row, image_query)
         if not picture:
             return
         link = extract_tag_attribute(picture, "text")
         if not link or not link.startswith("http") or not link.endswith(".jpg"):
             return
         yield {"tag": "img", "src": link}

         caption_cell = find_tag(row, caption_query)
         if not caption_cell:
             return
         caption = extract_tag_attribute(caption_cell, "text")
         if caption:
             yield {"tag": "p", "text": caption.replace("<br />", " ").strip()}

     collected = []
     for row in find_tags(self._soup, select_query("div#eData > dl")):
         collected.extend(entries_for(row))
     return self.clean_content(collected)
Exemplo n.º 2
0
 def extract_content(self, param, clean=None, before=None, after=None):
     """Locate the content root(s), prune them, and parse them.

     :param param: ``None`` to use ``self._find_content_tag()``, a single
         query passed to ``find_tag``, or a list of such queries. For a list,
         queries that find nothing or find a root already collected are
         skipped.
     :param clean: when not ``None``, forwarded with each root to
         ``self._clean_content``.
     :param before: when not ``None``, forwarded with each root to
         ``self._clean_content_before``.
     :param after: when not ``None``, forwarded with each root to
         ``self._clean_content_after``.
     :return: the result of ``self.clean_content`` applied to the
         concatenated parse output of all roots.
     """
     if param is None:
         candidates = [self._find_content_tag()]
     elif isinstance(param, list):
         candidates = []
         for query in param:
             found = find_tag(self._soup, query)
             if found is not None and found not in candidates:
                 candidates.append(found)
     else:
         candidates = [find_tag(self._soup, param)]

     pieces = []
     for node in candidates:
         # The single-query and default branches may have produced None.
         if node is None:
             continue
         # Pruning order is fixed: before, then after, then clean.
         if before is not None:
             self._clean_content_before(node, before)
         if after is not None:
             self._clean_content_after(node, after)
         if clean is not None:
             self._clean_content(node, clean)
         # <textarea> roots go through their own parser.
         if node.name == "textarea":
             parsed = self.parse_text_area(node)
         else:
             parsed = self.parse_content(node)
         pieces.extend(parsed)
     return self.clean_content(pieces)
Exemplo n.º 3
0
 def judge_missing(self, param):
     """Report whether any of the given queries matches a tag in the page.

     :param param: ``None``, a single query for ``find_tag``, or a list of
         such queries.
     :return: ``True`` as soon as one query yields a truthy result from
         ``find_tag(self.soup, ...)``; ``False`` when ``param`` is ``None``
         or nothing matches.
     """
     if param is None:
         return False
     queries = param if isinstance(param, list) else [param]
     # any() stops at the first hit, like the original early return.
     return any(find_tag(self.soup, query) for query in queries)
Exemplo n.º 4
0
 def _clean_content_after(self, root, param):
     """Remove the tag matched by ``param`` and every sibling after it.

     :param root: node searched with ``find_tag``.
     :param param: query passed to ``find_tag``.
     :return: ``None``; nothing happens when no tag is found.
     """
     marker = find_tag(root, param)
     if marker is None:
         return
     # Snapshot the siblings before detaching any of them; presumably
     # extract() alters the sibling chain that is being walked.
     for trailing in list(marker.next_siblings):
         trailing.extract()
     marker.extract()
Exemplo n.º 5
0
 def _clean_content_before(self, root, param):
     """Remove the tag matched by ``param`` and every sibling before it.

     :param root: node searched with ``find_tag``.
     :param param: query passed to ``find_tag``.
     :return: ``None``; nothing happens when no tag is found.
     """
     marker = find_tag(root, param)
     if marker is None:
         return
     # Snapshot the siblings before detaching any of them; presumably
     # extract() alters the sibling chain that is being walked.
     for leading in list(marker.previous_siblings):
         leading.extract()
     marker.extract()
Exemplo n.º 6
0
 def clean_author(self, author):
     """Strip the page's author/time span text out of ``author``.

     The text of ``div#news_template_03_AuthorAndTime > span`` is looked up
     in ``self.soup`` and every occurrence of it is removed from ``author``.

     :param author: the raw author string.
     :return: ``author`` with the span text removed, or ``author`` unchanged
         when the span is missing or yields no text.
     """
     param = {
         "method": "select",
         "params": {
             "selector": "div#news_template_03_AuthorAndTime > span"
         }
     }
     tag = find_tag(root=self.soup, param=param)
     if tag is None:
         # Page has no such span: nothing to strip.
         return author
     text = extract_tag_attribute(root=tag)
     if not text:
         # str.replace(None, "") would raise TypeError, and replacing an
         # empty string is a no-op, so return the input untouched.
         return author
     return author.replace(text, "")
Exemplo n.º 7
0
 def _extract_tag(root, param):
     """Find a tag with ``param`` and return one attribute of it.

     Example query, as given in the original documentation::

         param = {
             "method": "find_all",
             "params": {},
             "nth": 0,
             "attribute": "text",
         }

     :param root: node handed to ``find_tag`` as the search root.
     :param param: query mapping; its optional ``"attribute"`` entry names
         the attribute to extract, and ``"text"`` is used when the entry is
         absent or ``None``.
     :return: whatever ``extract_tag_attribute`` returns for the found tag.
     """
     found = find_tag(root, param)
     requested = param.get("attribute")
     # An explicit None is treated the same as a missing key.
     return extract_tag_attribute(found, "text" if requested is None else requested)
Exemplo n.º 8
0
 def find_tag_extract_attribute(root, params):
     """Find a tag with ``params`` and return the attribute it names.

     :param root: node handed to ``find_tag`` as the search root.
     :param params: query mapping; its ``"attribute"`` entry selects the
         attribute to extract and defaults to ``"text"`` when the key is
         absent.
     :return: the value from ``extract_tag_attribute``, or ``None`` when
         ``find_tag`` yields a falsy result.
     """
     node = find_tag(root, params)
     if node:
         attribute_name = params.get("attribute", "text")
         return extract_tag_attribute(node, attribute_name)
     return None