def _parse_meta(self):
    """
    Fill in the meta information stored in the ``<p class="meta-vypis">``
    element of the content tag.

    Sets :attr:`created_ts` and, when the corresponding pieces are present,
    also :attr:`has_tux`, :attr:`last_modified_ts` and :attr:`readed`.
    Does nothing when the content tag has no ``meta-vypis`` paragraph.
    """
    content_tag = self._parse_content_tag()

    meta_tags = content_tag.find("p", {"class": "meta-vypis"})
    if not meta_tags:
        return

    meta_tag = first(meta_tags)

    # presence of the ``blog_digest`` image is what flags the post
    if meta_tag.find("img", {"class": "blog_digest"}):
        self.has_tux = True

    # the markup carries no structure here, so work with the bare text
    plain_text = dhtmlparser.removeTags(meta_tag)
    self.created_ts = parse_timestamp(plain_text)

    # remaining values are looked up line by line
    text_lines = plain_text.strip().splitlines()

    def lines_containing(marker):
        return [line for line in text_lines if marker in line]

    # last modification time - the date is whatever follows the last ": "
    modified_lines = lines_containing("poslední úprava:")
    if modified_lines:
        modified_text = first(modified_lines).rsplit(": ", 1)[-1]
        self.last_modified_ts = parse_timestamp(modified_text)

    # number of reads - text after the last ":" up to the first "&"
    read_lines = lines_containing("Přečteno:")
    if read_lines:
        after_colon = first(read_lines).rpartition(":")[2]
        self.readed = int(after_colon.partition("&")[0])
def _izolate_timestamp(head_tag):
    """
    Parse the timestamp from the text (non-tag) elements of `head_tag`.

    Args:
        head_tag (obj): Element whose ``find()`` method is used to select
            every element for which ``isTag()`` is false.

    Returns:
        Result of :func:`parse_timestamp` called with the list of all text
        lines collected from the selected elements.
    """
    # keep only elements which are not tags
    text_elements = head_tag.find(None, fn=lambda x: not x.isTag())

    # Flatten the lines of all text elements into one list. Done in a single
    # pass - `sum(list_of_lists, [])` copies the accumulator for every
    # element, which is quadratic in the number of lines.
    lines = [
        line
        for element in text_elements
        for line in str(element).splitlines()
    ]

    return parse_timestamp(lines)
def from_html(html, lazy=True):
    """
    Create :class:`Blogpost` instance from the HTML code.

    Args:
        html (str): HTML source. An already parsed
            ``dhtmlparser.HTMLElement`` is accepted too and used as is.
        lazy (bool, default True): Passed to the :class:`Blogpost`
            constructor. When true, title, intro, rating, creation
            timestamp and number of comments are filled from `html`
            instead of being downloaded. Call :meth:`pull` for active
            download of all required informations.

    Returns:
        obj: :class:`Blogpost` instance.
    """
    already_parsed = isinstance(html, dhtmlparser.HTMLElement)
    if not already_parsed:
        html = dhtmlparser.parseString(html)
        dhtmlparser.makeDoubleLinked(html)

    # Two layouts are supported: a title in <h2 class="st_nadpis"> holding a
    # relative link, or a plain <h2> accompanied by <link rel="canonical">.
    styled_titles = html.find("h2", {"class": "st_nadpis"})
    if not styled_titles:
        title_tag = first(html.find("h2"))
        canonical_tag = first(html.find("link", {"rel": "canonical"}))
        link = canonical_tag.params["href"]
    else:
        title_tag = first(styled_titles)
        anchor_tag = first(title_tag.find("a"))
        link = url_context(anchor_tag.params["href"])

    title = dhtmlparser.removeTags(title_tag).strip()

    # meta paragraph is looked up even when it is not used later
    meta = html.find("p", {"class": "meta-vypis"})[0]

    blog = Blogpost(url=link, lazy=lazy)
    if not lazy:
        return blog

    # lazy instance - fill in everything that can be read from the HTML
    blog.title = title
    blog.intro = Blogpost._parse_intro(html, meta, title_tag)
    blog.rating = Blogpost._parse_rating_from_preview(meta)
    blog.created_ts = parse_timestamp(meta)
    blog.comments_n = Blogpost._parse_comments_n(meta)

    return blog