Exemplo n.º 1
0
def test_remove_tags():
    """
    Check that removeTags() keeps only the text of a parsed DOM.

    Markup without any text content has to give a falsy result.
    """
    parsed = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(parsed) == "axaxxexid"

    # tags (and comments) without any text around them
    textless_inputs = (
        "<b></b>",
        "<b><i></b>",
        "<b><!-- asd --><i></b>",
    )
    for markup in textless_inputs:
        parsed = dhtmlparser.parseString(markup)
        assert not dhtmlparser.removeTags(parsed)
Exemplo n.º 2
0
def test_remove_tags():
    """
    Check that removeTags() keeps only the text of a parsed DOM.

    Markup without any text content has to give a falsy result.
    """
    parsed = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(parsed) == "axaxxexid"

    # tags (and comments) without any text around them
    textless_inputs = (
        "<b></b>",
        "<b><i></b>",
        "<b><!-- asd --><i></b>",
    )
    for markup in textless_inputs:
        parsed = dhtmlparser.parseString(markup)
        assert not dhtmlparser.removeTags(parsed)
Exemplo n.º 3
0
def parse_table():
    """
    Yield one ``MIPSInfo`` per table row that contains ``<td>`` cells.

    The table comes from ``get_table()``. Cell 0 is used as the name,
    cell 1 as the MIPS value and cell 4 as the year.

    Yields:
        obj: ``MIPSInfo(name, float mips, int year)``.
    """
    for row in get_table().find("tr"):
        cells = row.find("td")

        # rows without any <td> are skipped
        if not cells:
            continue

        name, mips_text, year_text = [
            dhtmlparser.removeTags(cells[index])
            for index in (0, 1, 4)
        ]

        # clean mips: keep only the number in front of the "MIPS" word and
        # drop the thousands separators
        mips_text = mips_text.replace("&#160;", " ").split("MIPS")[0]
        mips_value = float(mips_text.replace(",", "").strip())

        yield MIPSInfo(name, mips_value, int(year_text))
Exemplo n.º 4
0
def _parse_publisher(details):
    """
    Parse publisher of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Publisher's name with tags removed and whitespace \
                  stripped, or None if it is missing or blank.
    """
    publisher_cell = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowNakladatel"
    )

    # publisher is not specified
    if not publisher_cell:
        return None

    publisher_name = dhtmlparser.removeTags(publisher_cell).strip()

    # blank string is reported as None
    return publisher_name or None
Exemplo n.º 5
0
def _parse_description(details):
    """
    Parse description of the book.

    The first ``div.ekniha`` and the first ``p.detailKat`` found inside of
    the description are replaced with blank elements before the tags are
    removed.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as plain text or None if not found or blank.
    """
    found = details.find("div", {"class": "detailPopis"})

    # description not found
    if not found:
        return None

    description_el = found[0]

    # remove links to ebook version and links to other books from the same
    # category
    unwanted = (
        ("div", "ekniha"),
        ("p", "detailKat"),
    )
    for tag_name, class_name in unwanted:
        matches = description_el.find(tag_name, {"class": class_name})
        if matches:
            matches[0].replaceWith(dhtmlparser.HTMLElement(""))

    # remove all HTML elements; blank description is reported as None
    plain_text = dhtmlparser.removeTags(description_el).strip()

    return plain_text or None
Exemplo n.º 6
0
    def _parse_meta(self):
        """
        Parse the ``p.meta-vypis`` element of the content tag.

        Sets ``created_ts`` and, when the matching data are present, also
        ``has_tux``, ``last_modified_ts`` and ``readed``. Nothing is set
        when the element is not found.
        """
        meta_paragraphs = self._parse_content_tag().find(
            "p",
            {"class": "meta-vypis"}
        )

        if not meta_paragraphs:
            return

        meta_tag = first(meta_paragraphs)

        if meta_tag.find("img", {"class": "blog_digest"}):
            self.has_tux = True

        # the element is not semantic at all, so work with its plain text
        plain_text = dhtmlparser.removeTags(meta_tag)

        self.created_ts = parse_timestamp(plain_text)

        # rest is picked line by line
        text_lines = plain_text.strip().splitlines()

        # last modification time is the part after the last ": "
        modified = [ln for ln in text_lines if "poslední úprava:" in ln]
        if modified:
            self.last_modified_ts = parse_timestamp(
                first(modified).split(": ")[-1]
            )

        # number of reads is between the last ":" and the first "&"
        read_counts = [ln for ln in text_lines if "Přečteno:" in ln]
        if read_counts:
            after_colon = first(read_counts).split(":")[-1]
            self.readed = int(after_colon.split("&")[0])
Exemplo n.º 7
0
    def _parse_meta(self):
        """
        Parse the ``p.meta-vypis`` element of the content tag.

        Sets ``created_ts`` and, when the matching data are present, also
        ``has_tux``, ``last_modified_ts`` and ``readed``. Nothing is set
        when the element is not found.
        """
        meta_paragraphs = self._parse_content_tag().find(
            "p",
            {"class": "meta-vypis"}
        )

        if not meta_paragraphs:
            return

        meta_tag = first(meta_paragraphs)

        if meta_tag.find("img", {"class": "blog_digest"}):
            self.has_tux = True

        # the element is not semantic at all, so work with its plain text
        plain_text = dhtmlparser.removeTags(meta_tag)

        self.created_ts = parse_timestamp(plain_text)

        # rest is picked line by line
        text_lines = plain_text.strip().splitlines()

        # last modification time is the part after the last ": "
        modified = [ln for ln in text_lines if "poslední úprava:" in ln]
        if modified:
            self.last_modified_ts = parse_timestamp(
                first(modified).split(": ")[-1]
            )

        # number of reads is between the last ":" and the first "&"
        read_counts = [ln for ln in text_lines if "Přečteno:" in ln]
        if read_counts:
            after_colon = first(read_counts).split(":")[-1]
            self.readed = int(after_colon.split("&")[0])
Exemplo n.º 8
0
def parse_table():
    """
    Yield one ``MIPSInfo`` per table row that contains ``<td>`` cells.

    The table comes from ``get_table()``. Cell 0 is used as the name,
    cell 1 as the MIPS value and cell 4 as the year.

    Yields:
        obj: ``MIPSInfo(name, float mips, int year)``.
    """
    for row in get_table().find("tr"):
        cells = row.find("td")

        # rows without any <td> are skipped
        if not cells:
            continue

        name, mips_text, year_text = [
            dhtmlparser.removeTags(cells[index])
            for index in (0, 1, 4)
        ]

        # clean mips: keep only the number in front of the "MIPS" word and
        # drop the thousands separators
        mips_text = mips_text.replace("&#160;", " ").split("MIPS")[0]
        mips_value = float(mips_text.replace(",", "").strip())

        yield MIPSInfo(name, mips_value, int(year_text))
Exemplo n.º 9
0
    def add_concept(self, text, title, ts_of_pub=None):
        """
        Add new concept into your concepts.

        Args:
            text (str): Text of your concept.
            title (str): Title of your concept. Do not use HTML in title!
            ts_of_pub (int/float, default None): Timestamp of the publication.

        Raises:
            ValueError: if the user doesn't have a blog.
            UserWarning: if the site is broken or user was logged out.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        dom = dhtmlparser.parseString(self._get(self.blog_url))

        # get section with links to new blog
        # (real lists are built here, because filter() is lazy on Python 3 -
        # its result is always truthy and can't be indexed)
        s_sekce = [
            div
            for div in dom.find("div", {"class": "s_sekce"})
            if "Vlož nový zápis" in div.getContent()
        ]
        if not s_sekce:
            raise UserWarning("Can't resolve right div tag!")

        # get link to "add blog" page
        add_blog_links = [
            link
            for link in s_sekce[0].find("a")
            if "href" in link.params and
            link.params["href"].endswith("action=add")
        ]
        if not add_blog_links:
            raise UserWarning("Can't resolve user number!")
        add_blog_link = add_blog_links[0].params["href"]

        # get "add blog" page
        data = self._get(ABCLINUXU_URL + add_blog_link)
        dom = dhtmlparser.parseString(data)

        form_action = dom.find("form", {"name": "form"})[0].params["action"]

        # tags are removed from the title before it is sent
        data = self.session.post(
            ABCLINUXU_URL + form_action,
            data={
                "cid": 0,
                "publish": shared.ts_to_concept_date(ts_of_pub),
                "content": text,
                "title": dhtmlparser.removeTags(title),
                "delay": "Do konceptů",
                "action": "add2"
            },
            verify=False,
        )
        data = data.text.encode("utf-8")

        # look for the content / title error markers in the response
        check_error_div(data, '<div class="error" id="contentError">')
        check_error_div(data, '<div class="error" id="titleError">')
    def _normalize_fn(cls, filename):
        """
        Build normalized version of the `filename`.

        The `filename` is parsed as HTML, its tags are removed and the
        stripped text is then passed through ``cls.normalize``,
        ``cls._remove_html_entities``, ``cls._only_alnum_chars`` and
        ``cls._remove_dup_underscores``, in this order.

        Args:
            filename (str): Filename, which may contain HTML.

        Returns:
            Result of the last processing step.
        """
        parsed = dhtmlparser.parseString(filename)
        result = dhtmlparser.removeTags(parsed).strip()

        pipeline = (
            cls.normalize,
            cls._remove_html_entities,
            cls._only_alnum_chars,
            cls._remove_dup_underscores,
        )
        for step in pipeline:
            result = step(result)

        return result
Exemplo n.º 11
0
    def _parse_intro(blog, meta, title_tag):
        """
        Parse intro from the content of the `blog` element.

        String forms of the `meta` and `title_tag` elements and of the first
        ``div.signature`` (if there is any) are cut out of the content. Tags
        are then removed from the rest.

        Returns:
            str: Stripped plain text of the intro.
        """
        cut_out = [str(meta), str(title_tag)]

        signatures = blog.find("div", {"class": "signature"})
        if signatures:
            cut_out.append(str(signatures[0]))

        intro = blog.getContent()
        for fragment in cut_out:
            intro = intro.replace(fragment, "")

        return dhtmlparser.removeTags(intro.strip()).strip()
Exemplo n.º 12
0
    def add_concept(self, text, title, ts_of_pub=None):
        """
        Add new concept into your concepts.

        Args:
            text (str): Text of your concept.
            title (str): Title of your concept. Do not use HTML in title!
            ts_of_pub (int/float, default None): Timestamp of the publication.

        Raises:
            ValueError: if the user doesn't have a blog.
            UserWarning: if the site is broken or user was logged out.
        """
        if not self.has_blog:
            raise ValueError("User doesn't have blog!")

        self.login()

        dom = dhtmlparser.parseString(self._get(self.blog_url))

        # get section with links to new blog
        # (real lists are built here, because filter() is lazy on Python 3 -
        # its result is always truthy and can't be indexed)
        s_sekce = [
            div
            for div in dom.find("div", {"class": "s_sekce"})
            if "Vlož nový zápis" in div.getContent()
        ]
        if not s_sekce:
            raise UserWarning("Can't resolve right div tag!")

        # get link to "add blog" page
        add_blog_links = [
            link
            for link in s_sekce[0].find("a")
            if "href" in link.params and
            link.params["href"].endswith("action=add")
        ]
        if not add_blog_links:
            raise UserWarning("Can't resolve user number!")
        add_blog_link = add_blog_links[0].params["href"]

        # get "add blog" page
        data = self._get(ABCLINUXU_URL + add_blog_link)
        dom = dhtmlparser.parseString(data)

        form_action = dom.find("form", {"name": "form"})[0].params["action"]

        # tags are removed from the title before it is sent
        data = self.session.post(
            ABCLINUXU_URL + form_action,
            data={
                "cid": 0,
                "publish": shared.ts_to_concept_date(ts_of_pub),
                "content": text,
                "title": dhtmlparser.removeTags(title),
                "delay": "Do konceptů",
                "action": "add2"
            },
            verify=False,
        )
        data = data.text.encode("utf-8")

        # look for the content / title error markers in the response, so
        # that a problem with the title is not silently ignored
        check_error_div(data, '<div class="error" id="contentError">')
        check_error_div(data, '<div class="error" id="titleError">')
Exemplo n.º 13
0
    def _parse_intro(blog, meta, title_tag):
        """
        Parse intro from the content of the `blog` element.

        String forms of the `meta` and `title_tag` elements and of the first
        ``div.signature`` (if there is any) are cut out of the content. Tags
        are then removed from the rest.

        Returns:
            str: Stripped plain text of the intro.
        """
        cut_out = [str(meta), str(title_tag)]

        signatures = blog.find("div", {"class": "signature"})
        if signatures:
            cut_out.append(str(signatures[0]))

        intro = blog.getContent()
        for fragment in cut_out:
            intro = intro.replace(fragment, "")

        return dhtmlparser.removeTags(intro.strip()).strip()
Exemplo n.º 14
0
    def _add_item_to_feed(cls, registry, feed, post):
        """
        Add one entry describing the `post` into the `feed`.

        The first ``<a>`` from ``post.title`` decides where the entry points
        to: when ``registry.is_ref_str()`` accepts its ``href``, the title
        and path of the referenced registry item are used (joined with
        ``settings.blog_url``), otherwise the ``href`` and the text of the
        link are used as they are.

        Args:
            registry (obj): Object with ``is_ref_str()`` and
                ``item_by_ref_str()`` methods.
            feed (obj): Feed object with ``add_entry()`` method.
            post (obj): Object with ``title``, ``timestamp`` and
                ``description_clean`` attributes.

        Raises:
            IndexError: if ``post.title`` contains no ``<a>`` element.
        """
        title_dom = dhtmlparser.parseString(post.title)

        link = title_dom.find("a")[0]
        href = link.params.get("href", "")

        if registry.is_ref_str(href):
            item = registry.item_by_ref_str(href)

            title = item.title
            url = settings.blog_url

            # make sure that there is a slash between the blog url and path
            path = item.path
            if not path.startswith("/") and not url.endswith("/"):
                url += "/"

            url += path
        else:
            url = href
            title = dhtmlparser.removeTags(link.getContent())

        # NOTE: local timezone used to be computed here, but the result was
        # never used; the date is parsed with fixed CET timezone setting
        raw_date = dhtmlparser.removeTags(post.timestamp).replace("@", "")
        pub_date = dateparser.parse(
            raw_date,
            settings={
                'TIMEZONE': 'CET',
                'RETURN_AS_TIMEZONE_AWARE': True
            }
        )

        entry = feed.add_entry()
        entry.id(url)
        entry.title(title)
        entry.link(href=url)
        entry.updated(pub_date)
        entry.published(pub_date)
        entry.author({'name': settings.twitter_handle.replace("@", "")})
        entry.summary(post.description_clean or "No description.", type="text")
Exemplo n.º 15
0
def parse(data):
    """
    Parse article previews from the `data`.

    Each ``div.articlePreview`` with at least one ``<h2>`` gives one item.

    Args:
        data (str): HTML passed to ``dhtmlparser.parseString()``.

    Yields:
        tuple: ``(title, link, date)``, where title is the content of the \
               first ``<h2>`` without tags, link is the result of \
               ``_parse_link()`` and date is the result of ``_parse_date()``.
    """
    dom = dhtmlparser.parseString(data)

    for preview in dom.find("div", {"class": "articlePreview"}):
        headings = preview.find("h2")

        # only items with <h2> are used
        if headings:
            heading = headings[0]

            yield (
                dhtmlparser.removeTags(heading.getContent()),
                _parse_link(heading),
                _parse_date(preview),
            )
Exemplo n.º 16
0
    def from_html(html, lazy=True):
        """
        Convert HTML to :class:`Blogpost` instance.

        Args:
            html (str/obj): Input data. Anything that is not a dhtmlparser
                 ``HTMLElement`` is parsed (and double linked) first.
            lazy (bool, default True): Be lazy (don't pull data by yourself
                 from the site). Call :meth:`pull` for active download of all
                 required information.

        Returns:
            obj: :class:`Blogpost` instance. Title, intro, rating, creation \
                 timestamp and number of comments are filled only when \
                 `lazy` is set.
        """
        if not isinstance(html, dhtmlparser.HTMLElement):
            html = dhtmlparser.parseString(html)
            dhtmlparser.makeDoubleLinked(html)

        # support for legacy blogs - the title is either <h2 class="st_nadpis">
        # with link inside, or just first <h2> with canonical <link> elsewhere
        nadpis_tags = html.find("h2", {"class": "st_nadpis"})
        if nadpis_tags:
            title_tag = first(nadpis_tags)
            link = url_context(first(title_tag.find("a")).params["href"])
        else:
            title_tag = first(html.find("h2"))
            canonical = first(html.find("link", {"rel": "canonical"}))
            link = canonical.params["href"]

        title = dhtmlparser.removeTags(title_tag).strip()

        # get meta
        meta = html.find("p", {"class": "meta-vypis"})[0]

        blog = Blogpost(url=link, lazy=lazy)

        if not lazy:
            return blog

        blog.title = title
        blog.intro = Blogpost._parse_intro(html, meta, title_tag)
        blog.rating = Blogpost._parse_rating_from_preview(meta)
        blog.created_ts = parse_timestamp(meta)
        blog.comments_n = Blogpost._parse_comments_n(meta)

        return blog
Exemplo n.º 17
0
    def from_html(html, lazy=True):
        """
        Convert HTML to :class:`Blogpost` instance.

        Args:
            html (str/obj): Input data. Anything that is not a dhtmlparser
                 ``HTMLElement`` is parsed (and double linked) first.
            lazy (bool, default True): Be lazy (don't pull data by yourself
                 from the site). Call :meth:`pull` for active download of all
                 required information.

        Returns:
            obj: :class:`Blogpost` instance. Title, intro, rating, creation \
                 timestamp and number of comments are filled only when \
                 `lazy` is set.
        """
        if not isinstance(html, dhtmlparser.HTMLElement):
            html = dhtmlparser.parseString(html)
            dhtmlparser.makeDoubleLinked(html)

        # support for legacy blogs - the title is either <h2 class="st_nadpis">
        # with link inside, or just first <h2> with canonical <link> elsewhere
        nadpis_tags = html.find("h2", {"class": "st_nadpis"})
        if nadpis_tags:
            title_tag = first(nadpis_tags)
            link = url_context(first(title_tag.find("a")).params["href"])
        else:
            title_tag = first(html.find("h2"))
            canonical = first(html.find("link", {"rel": "canonical"}))
            link = canonical.params["href"]

        title = dhtmlparser.removeTags(title_tag).strip()

        # get meta
        meta = html.find("p", {"class": "meta-vypis"})[0]

        blog = Blogpost(url=link, lazy=lazy)

        if not lazy:
            return blog

        blog.title = title
        blog.intro = Blogpost._parse_intro(html, meta, title_tag)
        blog.rating = Blogpost._parse_rating_from_preview(meta)
        blog.created_ts = parse_timestamp(meta)
        blog.comments_n = Blogpost._parse_comments_n(meta)

        return blog
Exemplo n.º 18
0
def _parse_price(html_chunk):
    """
    Parse price of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Last line of the ``div.prices`` content with tags \
                  removed, or None if the content is not found.
    """
    prices_divs = html_chunk.find("div", {"class": "prices"})
    raw_price = get_first_content(prices_divs)

    if not raw_price:
        return None

    # it is always in format Cena:\n150kč, so only the last line is used
    price_lines = dhtmlparser.removeTags(raw_price).split("\n")

    return price_lines[-1]
Exemplo n.º 19
0
    def title(self):
        """
        Return title of the document.

        Value stored under the ``title`` key of the instance ``__dict__`` is
        preferred when it is not None. Otherwise the ``<title>`` and
        ``<h1>`` - ``<h6>`` elements of ``self.dom`` are tried in this order
        and the first one with non-blank text is used.

        Returns:
            str/None: Title with whitespace runs collapsed to single \
                      spaces, or None when no usable element was found.
        """
        stored_title = self.__dict__.get("title")
        if stored_title is not None:
            return stored_title

        tag_names = ("title", "h1", "h2", "h3", "h4", "h5", "h6")
        candidates = [
            element
            for tag_name in tag_names
            for element in self.dom.find(tag_name)
        ]

        for candidate in candidates:
            plain_text = dhtmlparser.removeTags(candidate.getContent())

            # split() without arguments also gets rid of the leading and
            # trailing whitespace
            collapsed = " ".join(plain_text.split())

            if collapsed:
                return collapsed

        return None
Exemplo n.º 20
0
def _parse_description(html_chunk):
    """
    Parse description of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as plain text with ``<br>`` tags converted \
                  to newlines, or None if not found.
    """
    paragraphs = html_chunk.match(
        ["div", {"class": "kniha_detail_text"}],
        "p"
    )

    if not paragraphs:
        return None

    text = get_first_content(paragraphs)

    # keep line breaks, which would be otherwise lost with the tags
    for line_break in ("<br />", "<br/>"):
        text = text.replace(line_break, "\n")

    return dhtmlparser.removeTags(text).strip()
Exemplo n.º 21
0
    def _izolate_username(head_tag):
        """
        Pick username from the `head_tag`.

        Returns:
            tuple: ``(username, registered)``. Username comes from the \
                   ``/lide/`` link for registered users, otherwise it is \
                   the line which follows the line with time.
        """
        profile_links = head_tag.find(
            "a",
            fn=lambda tag: tag.params.get("href", "").startswith("/lide/")
        )

        if profile_links:
            # /lide/manasekp -> manasekp
            profile_href = first(profile_links).params["href"]

            return profile_href.split("/")[2], True  # registered

        # parse unregistered username from unstructured HTML like:
        #         10.2. 21:53
        #
        #       Tomáškova máma

        plain_text = dhtmlparser.removeTags(head_tag.getContent())

        # blank lines are thrown away
        stripped_lines = (line.strip() for line in plain_text.splitlines())
        lines = [line for line in stripped_lines if line]

        # username is at the line next to the line with time
        time_line = first(date_izolator(lines))
        username = lines[lines.index(time_line) + 1]

        if username == "Rozbalit":  # no username was found
            return "", False  # unregistered

        return username.strip(), False  # unregistered
Exemplo n.º 22
0
def detect_language(index_page):
    """
    Detect language of the page using `langdetect` library.

    Args:
        index_page (str): HTML content of the page you wish to analyze.

    Returns:
        obj: One :class:`.SourceString` object.
    """
    plain_text = dhtmlparser.removeTags(
        dhtmlparser.parseString(index_page)
    )

    try:
        detected = langdetect.detect(plain_text)
    except UnicodeDecodeError:
        # second attempt with the text decoded from UTF-8
        detected = langdetect.detect(plain_text.decode("utf-8"))

    return SourceString(detected, source="langdetect")
Exemplo n.º 23
0
def test_remove_tags_str_input():
    """Check that removeTags() also accepts a plain string with markup."""
    assert dhtmlparser.removeTags("a<b>xax<i>xe</i>xi</b>d") == "axaxxexid"
Exemplo n.º 24
0
    def title(self):
        """
        Return ``alt_title`` if it is set, otherwise the stripped text of
        the first ``h1.page-title`` element from ``self.dom``.
        """
        if not self.alt_title:
            heading = self.dom.find("h1", {"class": "page-title"})[0]
            plain_text = dhtmlparser.removeTags(heading.__str__())

            return plain_text.strip()

        return self.alt_title
Exemplo n.º 25
0
def test_remove_tags_str_input():
    """Check that removeTags() also accepts a plain string with markup."""
    assert dhtmlparser.removeTags("a<b>xax<i>xe</i>xi</b>d") == "axaxxexid"