def test_remove_tags():
    dom = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(dom) == "axaxxexid"

    dom = dhtmlparser.parseString("<b></b>")
    assert not dhtmlparser.removeTags(dom)

    dom = dhtmlparser.parseString("<b><i></b>")
    assert not dhtmlparser.removeTags(dom)

    dom = dhtmlparser.parseString("<b><!-- asd --><i></b>")
    assert not dhtmlparser.removeTags(dom)
def parse_table():
    for tr in get_table().find("tr"):
        tds = tr.find("td")

        # skip rows without cells (header rows)
        if not tds:
            continue

        name = dhtmlparser.removeTags(tds[0])
        mips = dhtmlparser.removeTags(tds[1])
        year = dhtmlparser.removeTags(tds[4])

        # clean mips - normalize non-breaking spaces, strip the "MIPS"
        # suffix and thousands separators
        mips = mips.replace(u"\xa0", " ")
        mips = mips.split("MIPS")[0].replace(",", "").strip()

        yield MIPSInfo(name, float(mips), int(year))
def _parse_publisher(details):
    """
    Parse publisher of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Publisher's name as string or None if not found.
    """
    publisher = _get_td_or_none(
        details,
        "ctl00_ContentPlaceHolder1_tblRowNakladatel"
    )

    # publisher is not specified
    if not publisher:
        return None

    publisher = dhtmlparser.removeTags(publisher).strip()

    # return None instead of blank string
    if not publisher:
        return None

    return publisher
def _parse_description(details):
    """
    Parse description of the book.

    Args:
        details (obj): HTMLElement containing slice of the page with details.

    Returns:
        str/None: Description as string or None if not found.
    """
    description = details.find("div", {"class": "detailPopis"})

    # description not found
    if not description:
        return None

    # remove links to ebook version
    ekniha = description[0].find("div", {"class": "ekniha"})
    if ekniha:
        ekniha[0].replaceWith(dhtmlparser.HTMLElement(""))

    # remove links to other books from the same category
    detail = description[0].find("p", {"class": "detailKat"})
    if detail:
        detail[0].replaceWith(dhtmlparser.HTMLElement(""))

    # remove all HTML elements
    description = dhtmlparser.removeTags(description[0]).strip()

    # description is blank
    if not description:
        return None

    return description
def _parse_meta(self):
    content = self._parse_content_tag()
    meta_vypis_tags = content.find("p", {"class": "meta-vypis"})

    if not meta_vypis_tags:
        return

    meta_vypis_tag = first(meta_vypis_tags)

    has_tux_tags = meta_vypis_tag.find("img", {"class": "blog_digest"})
    if has_tux_tags:
        self.has_tux = True

    # get clean string - another thing which is not semantic at all
    lines = dhtmlparser.removeTags(meta_vypis_tag)
    self.created_ts = parse_timestamp(lines)

    # the rest will be picked one by one
    lines = lines.strip().splitlines()

    # parse last modification time
    modified_ts_line = [x for x in lines if "poslední úprava:" in x]
    if modified_ts_line:
        date_string = first(modified_ts_line).split(": ")[-1]
        self.last_modified_ts = parse_timestamp(date_string)

    # parse number of reads
    reads_line = [x for x in lines if "Přečteno:" in x]
    if reads_line:
        reads = first(reads_line).split(":")[-1].split("&")[0]
        self.readed = int(reads)
def add_concept(self, text, title, ts_of_pub=None):
    """
    Add a new concept into your concepts.

    Args:
        text (str): Text of your concept.
        title (str): Title of your concept. Do not use HTML in the title!
        ts_of_pub (int/float, default None): Timestamp of the publication.

    Raises:
        UserWarning: If the site is broken or the user was logged out.
    """
    if not self.has_blog:
        raise ValueError("User doesn't have blog!")

    self.login()

    dom = dhtmlparser.parseString(self._get(self.blog_url))

    # get section with links to new blog
    s_sekce = filter(
        lambda x: "Vlož nový zápis" in x.getContent(),
        dom.find("div", {"class": "s_sekce"})
    )
    if not s_sekce:
        raise UserWarning("Can't resolve right div tag!")

    # get link to "add blog" page
    add_blog_link = filter(
        lambda x: "href" in x.params and
                  x.params["href"].endswith("action=add"),
        s_sekce[0].find("a")
    )
    if not add_blog_link:
        raise UserWarning("Can't resolve user number!")

    add_blog_link = add_blog_link[0].params["href"]

    # get "add blog" page
    data = self._get(ABCLINUXU_URL + add_blog_link)
    dom = dhtmlparser.parseString(data)

    form_action = dom.find("form", {"name": "form"})[0].params["action"]

    data = self.session.post(
        ABCLINUXU_URL + form_action,
        data={
            "cid": 0,
            "publish": shared.ts_to_concept_date(ts_of_pub),
            "content": text,
            "title": dhtmlparser.removeTags(title),
            "delay": "Do konceptů",
            "action": "add2"
        },
        verify=False,
    )
    data = data.text.encode("utf-8")

    check_error_div(data, '<div class="error" id="contentError">')
    check_error_div(data, '<div class="error" id="titleError">')
def _normalize_fn(cls, filename):
    filename_dom = dhtmlparser.parseString(filename)

    new_filename = dhtmlparser.removeTags(filename_dom).strip()
    new_filename = cls.normalize(new_filename)
    new_filename = cls._remove_html_entities(new_filename)
    new_filename = cls._only_alnum_chars(new_filename)
    new_filename = cls._remove_dup_underscores(new_filename)

    return new_filename
def _parse_intro(blog, meta, title_tag):
    """
    Parse the intro from the `meta` HTML part.
    """
    intro = blog.getContent().replace(str(meta), "")
    intro = intro.replace(str(title_tag), "")

    signature = blog.find("div", {"class": "signature"})
    if signature:
        intro = intro.replace(str(signature[0]), "")

    return dhtmlparser.removeTags(intro.strip()).strip()
def _add_item_to_feed(cls, registry, feed, post):
    title_dom = dhtmlparser.parseString(post.title)
    link = title_dom.find("a")[0]
    href = link.params.get("href", "")

    if registry.is_ref_str(href):
        item = registry.item_by_ref_str(href)
        title = item.title

        url = settings.blog_url
        path = item.path
        if not path.startswith("/") and not url.endswith("/"):
            url += "/"
        url += path
    else:
        url = href
        title = dhtmlparser.removeTags(link.getContent())

    # bleh
    my_timezone = pytz.timezone(str(tzlocal.get_localzone()))
    timezone = datetime.datetime.now(my_timezone).strftime('%z')

    raw_date = dhtmlparser.removeTags(post.timestamp).replace("@", "")
    pub_date = dateparser.parse(raw_date, settings={
        'TIMEZONE': 'CET',
        'RETURN_AS_TIMEZONE_AWARE': True
    })

    entry = feed.add_entry()
    entry.id(url)
    entry.title(title)
    entry.link(href=url)
    entry.updated(pub_date)
    entry.published(pub_date)
    entry.author({'name': settings.twitter_handle.replace("@", "")})
    entry.summary(post.description_clean or "No description.", type="text")
def parse(data):
    dom = dhtmlparser.parseString(data)

    for preview in dom.find("div", {"class": "articlePreview"}):
        title_and_link = preview.find("h2")

        # skip items without <h2>
        if not title_and_link:
            continue

        title_and_link = title_and_link[0]

        title = dhtmlparser.removeTags(title_and_link.getContent())
        link = _parse_link(title_and_link)
        date = _parse_date(preview)

        yield title, link, date
def from_html(html, lazy=True):
    """
    Convert HTML string to :class:`Blogpost` instance.

    Args:
        html (str): Input data.
        lazy (bool, default True): Be lazy (don't pull data by yourself from
             the site). Call :meth:`pull` for active download of all required
             information.

    Returns:
        obj: :class:`Blogpost` instance.
    """
    if not isinstance(html, dhtmlparser.HTMLElement):
        html = dhtmlparser.parseString(html)
        dhtmlparser.makeDoubleLinked(html)

    # support for legacy blogs
    title_tag = html.find("h2", {"class": "st_nadpis"})
    if title_tag:
        title_tag = first(title_tag)
        rel_link = first(title_tag.find("a")).params["href"]
        link = url_context(rel_link)
    else:
        title_tag = first(html.find("h2"))
        link = first(html.find("link", {"rel": "canonical"}))
        link = link.params["href"]

    title = dhtmlparser.removeTags(title_tag).strip()

    # get meta
    meta = html.find("p", {"class": "meta-vypis"})[0]

    blog = Blogpost(url=link, lazy=lazy)

    if lazy:
        blog.title = title
        blog.intro = Blogpost._parse_intro(html, meta, title_tag)
        blog.rating = Blogpost._parse_rating_from_preview(meta)
        blog.created_ts = parse_timestamp(meta)
        blog.comments_n = Blogpost._parse_comments_n(meta)

    return blog
def _parse_price(html_chunk):
    """
    Parse price of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
                          details.

    Returns:
        str/None: Price as string with currency or None if not found.
    """
    price = get_first_content(
        html_chunk.find("div", {"class": "prices"})
    )

    if not price:
        return None

    # the price is always in the format "Cena:\n150kč"
    price = dhtmlparser.removeTags(price)
    price = price.split("\n")[-1]

    return price
def title(self):
    if self.__dict__.get("title") is not None:
        return self.__dict__["title"]

    headings = []
    headings.extend(self.dom.find("title"))
    headings.extend(self.dom.find("h1"))
    headings.extend(self.dom.find("h2"))
    headings.extend(self.dom.find("h3"))
    headings.extend(self.dom.find("h4"))
    headings.extend(self.dom.find("h5"))
    headings.extend(self.dom.find("h6"))

    for h in headings:
        heading_content = dhtmlparser.removeTags(h.getContent())
        heading_content = heading_content.strip()

        # collapse unnecessary whitespace
        heading_content = " ".join(heading_content.split())

        if heading_content:
            return heading_content
def _parse_description(html_chunk):
    """
    Parse description of the book.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
                          details.

    Returns:
        str/None: Description as string or None if not found.
    """
    description_tag = html_chunk.match(
        ["div", {"class": "kniha_detail_text"}],
        "p"
    )

    if not description_tag:
        return None

    description = get_first_content(description_tag)
    description = description.replace("<br />", "\n")
    description = description.replace("<br/>", "\n")

    return dhtmlparser.removeTags(description).strip()
def _izolate_username(head_tag):
    user_tag = head_tag.find(
        "a",
        fn=lambda x: x.params.get("href", "").startswith("/lide/")
    )

    if user_tag:
        user_link = first(user_tag).params["href"]

        # /lide/manasekp -> manasekp
        real_username = user_link.split("/")[2]

        return real_username, True  # registered

    # parse unregistered username from unstructured HTML like:
    #     10.2. 21:53
    #
    #     Tomáškova máma
    str_repr = dhtmlparser.removeTags(head_tag.getContent())

    # remove blank lines
    lines = [x.strip() for x in str_repr.splitlines() if x.strip()]

    # isolate the line with the time
    line_with_time = first(date_izolator(lines))

    # pick the line next to the line with the time
    username = lines[lines.index(line_with_time) + 1]

    def clean_username(username):
        if username == "Rozbalit":  # no username was found
            return ""

        return username.strip()

    return clean_username(username), False  # unregistered
def detect_language(index_page):
    """
    Detect the language of the page using the `langdetect` library.

    Args:
        index_page (str): HTML content of the page you wish to analyze.

    Returns:
        obj: One :class:`.SourceString` object.
    """
    dom = dhtmlparser.parseString(index_page)
    clean_content = dhtmlparser.removeTags(dom)

    lang = None
    try:
        lang = langdetect.detect(clean_content)
    except UnicodeDecodeError:
        lang = langdetect.detect(clean_content.decode("utf-8"))

    return SourceString(
        lang,
        source="langdetect"
    )
def test_remove_tags_str_input():
    inp = "a<b>xax<i>xe</i>xi</b>d"

    assert dhtmlparser.removeTags(inp) == "axaxxexid"
def title(self):
    if self.alt_title:
        return self.alt_title

    title_el = self.dom.find("h1", {"class": "page-title"})[0]

    return dhtmlparser.removeTags(str(title_el)).strip()