def _get_last_five_tags(self):
    """Return (top, bottom) wrapper <div>s around the cached last-five HTML."""
    template = '<div id="last_five_%s">\n%s\n</div>'
    tags = []
    for position in ("top", "bottom"):
        markup = template % (position, self.last_five_html)
        tags.append(dhtmlparser.parseString(markup).find("div")[0])
    return tuple(tags)
def add_concept(self, text, title, ts_of_pub=None):
    """
    Adds new concept into your concepts.

    Args:
        text (str): Text of your concept.
        title (str): Title of your concept. Do not use HTML in title!
        ts_of_pub (int/float, default None): Timestamp of the publication.

    Raises:
        UserWarning: if the site is broken or user was logged out.
        ValueError: if the user doesn't have a blog.
    """
    if not self.has_blog:
        raise ValueError("User doesn't have blog!")

    self.login()

    dom = dhtmlparser.parseString(self._get(self.blog_url))

    # get section with links to new blog
    # NOTE: relies on Python 2 filter() returning a list (indexed below)
    s_sekce = filter(
        lambda x: "Vlož nový zápis" in x.getContent(),
        dom.find("div", {"class": "s_sekce"})
    )
    if not s_sekce:
        raise UserWarning("Can't resolve right div tag!")

    # get link to "add blog" page
    add_blog_link = filter(
        lambda x: "href" in x.params and x.params["href"].endswith("action=add"),
        s_sekce[0].find("a")
    )
    if not add_blog_link:
        raise UserWarning("Can't resolve user number!")
    add_blog_link = add_blog_link[0].params["href"]

    # get "add blog" page
    data = self._get(ABCLINUXU_URL + add_blog_link)
    dom = dhtmlparser.parseString(data)

    form_action = dom.find("form", {"name": "form"})[0].params["action"]

    # submit the concept; "delay" selects the "save as concept" button
    data = self.session.post(
        ABCLINUXU_URL + form_action,
        data={
            "cid": 0,
            "publish": shared.ts_to_concept_date(ts_of_pub),
            "content": text,
            "title": dhtmlparser.removeTags(title),
            "delay": "Do konceptů",
            "action": "add2"
        },
        verify=False,
    )
    data = data.text.encode("utf-8")

    # the server reports validation problems in inline error <div>s
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_div(data, '<div class="error" id="titleError">')
def test_remove_tags():
    """removeTags() should strip markup, keeping only text content."""
    dom = dhtmlparser.parseString("a<b>xax<i>xe</i>xi</b>d")
    assert dhtmlparser.removeTags(dom) == "axaxxexid"

    # empty element -> empty (falsy) result
    dom = dhtmlparser.parseString("<b></b>")
    assert not dhtmlparser.removeTags(dom)

    # unclosed nested tag is still handled
    dom = dhtmlparser.parseString("<b><i></b>")
    assert not dhtmlparser.removeTags(dom)

    # comments are dropped as well
    dom = dhtmlparser.parseString("<b><!-- asd --><i></b>")
    assert not dhtmlparser.removeTags(dom)
def add_concept(self, text, title, ts_of_pub=None):
    """
    Adds new concept into your concepts.

    Args:
        text (str): Text of your concept.
        title (str): Title of your concept. Do not use HTML in title!
        ts_of_pub (int/float, default None): Timestamp of the publication.

    Raises:
        UserWarning: if the site is broken or user was logged out.
        ValueError: if the user doesn't have a blog.
    """
    if not self.has_blog:
        raise ValueError("User doesn't have blog!")

    self.login()

    dom = dhtmlparser.parseString(self._get(self.blog_url))

    # get section with links to new blog
    # NOTE: relies on Python 2 filter() returning a list (indexed below)
    s_sekce = filter(
        lambda x: "Vlož nový zápis" in x.getContent(),
        dom.find("div", {"class": "s_sekce"})
    )
    if not s_sekce:
        raise UserWarning("Can't resolve right div tag!")

    # get link to "add blog" page
    add_blog_link = filter(
        lambda x: "href" in x.params and x.params["href"].endswith("action=add"),
        s_sekce[0].find("a")
    )
    if not add_blog_link:
        raise UserWarning("Can't resolve user number!")
    add_blog_link = add_blog_link[0].params["href"]

    # get "add blog" page
    data = self._get(ABCLINUXU_URL + add_blog_link)
    dom = dhtmlparser.parseString(data)

    form_action = dom.find("form", {"name": "form"})[0].params["action"]

    # submit the concept; "delay" selects the "save as concept" button
    data = self.session.post(
        ABCLINUXU_URL + form_action,
        data={
            "cid": 0,
            "publish": shared.ts_to_concept_date(ts_of_pub),
            "content": text,
            "title": dhtmlparser.removeTags(title),
            "delay": "Do konceptů",
            "action": "add2"
        },
        verify=False,
    )
    data = data.text.encode("utf-8")

    # the server reports validation problems in inline error <div>s;
    # also check title errors (this was missing, unlike the sibling copy
    # of this method elsewhere in the codebase)
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_div(data, '<div class="error" id="titleError">')
def parse(self):
    """Parse ``self.content`` into ``self.dom``, then derive the cropped variants."""
    try:
        self.dom = dhtmlparser.parseString(self.content)
    except UnicodeDecodeError:
        # re-encode once and retry; parseString chokes on some unicode inputs
        self.content = self.content.encode("utf-8")
        self.dom = dhtmlparser.parseString(self.content)

    cropped = self.crop_content(content=self.content, dom=self.dom)
    self.cropped_content = cropped
    self.cropped_dom = dhtmlparser.parseString(cropped)
def parse_to_url(to_url='', title_word=''):
    """
    Scrape joke images from `to_url`
    (e.g. http://gaoxiao.jokeji.cn/GrapHtml/quweigaoxiao/20140709211210.htm)
    and save each one tagged with `title_word`.
    """
    html_doc = urllib2.urlopen(to_url).read()
    # pages are served as GBK; re-encode to UTF-8 before parsing
    html_doc = html_doc.decode('gbk').encode('utf8')
    dom = d.parseString(html_doc)
    target_uls = []  # the target <ul> elements we are looking for
    # first, find all <ul> tags
    uls = dom.find('ul')
    # then pick out the target <ul>s by the rules below
    for ul in uls:
        ul_content = ul.getContent()
        sub_dom = d.parseString(ul_content)
        # rule 1: no <a> tag may be found under the <ul>
        aes = sub_dom.find('a')
        if len(aes) > 0:
            continue
        # rule 2: a <b> tag must be present under the <ul>
        bes = sub_dom.find('b')
        if len(bes) == 0:
            continue
        # rule 3: at least one <img> with attribute style="CURSOR: hand"
        handes = sub_dom.find('img', {"style": "CURSOR: hand"})
        if len(handes) == 0:
            continue
        # whatever survives all rules is a target
        target_uls.append(ul)
    info = []
    # parse the target <ul>s: collect src/title of every <img> inside
    for ul in target_uls:
        text = ul.getContent()
        sub_dom = d.parseString(text)
        imgs = sub_dom.find('img')
        for img in imgs:
            src = base_url + img.params['src']
            title = img.params['alt']
            info.append({'src': src, 'title': title, 'overview': title_word})
    # save the collected images
    for i in info:
        save_img(i)
def _add_sidebar_skeletons_to_page(
        self, page: 'HtmlPage') -> Tuple[HTMLElement, HTMLElement]:
    """Insert empty sidebar placeholder <div>s at the top and bottom of <body>."""
    def make_skeleton(markup):
        # parse the snippet and return the <div> element itself
        return dhtmlparser.parseString(markup).find("div")[0]

    top_tag = make_skeleton("""<div id="sidebar_top"></div>""")
    bottom_tag = make_skeleton('<div id="sidebar_bottom">\n</div>')

    body = page.dom.find("body")[0]
    body.childs.insert(0, top_tag)
    body.childs.append(bottom_tag)

    return top_tag, bottom_tag
def _add_syntax_highlight_for(cls, lexer, code, code_content):
    # Render the raw code through pygments into colored HTML.
    formatter = HtmlFormatter(wrapcode=False)
    colored_text = highlight(code_content, lexer(), formatter)
    pre_tag = dhtmlparser.parseString(colored_text).find("pre")[0]

    # wrap content of the <pre> to the <code>
    code_tag = dhtmlparser.parseString("<code></code>").find("code")[0]
    code_tag.childs = pre_tag.childs
    pre_tag.childs = [code_tag]

    # mark the <pre> so site CSS can style it
    pre_tag.params["class"] = "code"

    # swap the original element for the highlighted one
    code.parent.replaceWith(pre_tag)
def test_parseString_cip():
    """With cip=False, parameter names keep their original case."""
    dom = dhtmlparser.parseString(
        """<html><tag PARAM="true"></html>""",
        cip=False
    )

    assert dom.childs
    assert len(dom.childs) == 2

    # opening and closing <html> are siblings at top level
    assert dom.childs[0].getTagName() == "html"
    assert dom.childs[1].getTagName() == "html"

    assert dom.childs[0].isOpeningTag()
    assert dom.childs[1].isEndTag()

    assert dom.childs[0].childs
    assert not dom.childs[1].childs

    assert dom.childs[0].childs[0].getTagName() == "tag"
    assert dom.childs[0].childs[0].params
    assert not dom.childs[0].childs[0].childs

    # case-sensitive params: only the original spelling exists
    assert "param" not in dom.childs[0].childs[0].params
    assert "PARAM" in dom.childs[0].childs[0].params

    assert dom.childs[0].childs[0].params["PARAM"] == "true"

    with pytest.raises(KeyError):
        dom.childs[0].childs[0].params["param"]
def _parse_tags(tags_xml):
    """Parse the ajax tag-list XML into a list of Tag objects."""
    # see http://www.abclinuxu.cz/ajax/tags/list for details
    dom = dhtmlparser.parseString(tags_xml)

    tags = []
    for element in dom.find("s"):
        tags.append(Tag(element.params["l"], element.params["i"]))
    return tags
def getListOfBases():
    """
    This function is here mainly for purposes of unittest

    Returns:
        list of str: Valid bases as they are used as URL parameters in links at
                     Aleph main page.
    """
    downer = Downloader()
    data = downer.download(ALEPH_URL + "/F/?func=file&file_name=base-list")
    dom = dhtmlparser.parseString(data.lower())

    # from default aleph page filter links containing local_base in their href
    # NOTE: relies on Python 2 filter()/map() returning lists
    base_links = filter(
        lambda x: "href" in x.params and "local_base" in x.params["href"],
        dom.find("a")
    )

    # split links by & - we will need only XXX from link.tld/..&local_base=XXX
    base_links = map(
        lambda x: x.params["href"].replace("?", "&", 1).split("&"),
        base_links
    )

    # filter only sections containing bases
    bases = map(
        lambda link: filter(lambda base: "local_base=" in base, link)[0],
        base_links
    )

    # filter bases from base sections
    bases = map(lambda x: x.split("=")[1].strip(), bases)

    return list(set(bases))  # list(set()) is same as unique()
def get_blogposts(self):
    """
    Lists all of users PUBLISHED blogposts. For unpublished, see
    :meth:`get_concepts`.

    Returns:
        list: sorted (old->new) list of Blogpost objects.
    """
    if not self.has_blog:
        return []

    def cut_crap(data):
        # keep only the blogpost listing between the "Píšeme jinde" box
        # and the page header
        data = data.split(
            '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>')[0]
        return data.split('<div class="st" id="st">')[1]

    cnt = 0
    posts = []
    parsed = [1]  # just placeholder for first iteration
    # paginate until a page yields no blogposts
    while parsed:
        data = self._get(self._compose_blogposts_url(cnt))
        dom = dhtmlparser.parseString(cut_crap(data))

        parsed = [
            Blogpost.from_html(blog_html)
            for blog_html in dom.find("div", {"class": "cl"})
        ]
        posts.extend(parsed)
        cnt += BLOG_STEP

    return sorted(posts, key=lambda x: x.created_ts)
def remove_fluff(self, body):
    """
    Strip navigation, sidebars and other boilerplate from `body`, then
    return the main <article> element.
    """
    empty = dhtmlparser.parseString("")

    def replace(selector):
        # blank out every matched element in place
        for el in selector:
            el.replaceWith(empty)

    replace(body.find("p", {"id": "copyright"}))
    replace(body.find("aside", {"id": "sidebar"}))
    replace(body.find("nav", {"id": "next-page"}))
    replace(body.find("div", {"id": "comment_bubble_wrapper"}))
    replace(body.find("div", {"class": "nocontent"}))
    replace(body.find("div", {"class": "tertiary-content-wrapper"}))
    replace(body.find("div", {"class": "more-link"}))
    replace(body.find("div", {"class": "view-content"}))
    replace(body.find("div", {"class": "block-content content"}))
    replace(body.find("div", {"class": "region region-content-aside"}))
    replace(body.find("div", {"role": "search"}))
    # class match by substring, not exact equality
    replace(
        body.find("div",
                  fn=lambda x: "block-Eggplant-navigation" in x.params.get(
                      "class", "")))
    replace(body.find("header"))
    replace(body.find("div", {"id": "tertiary-content-wrapper"}))
    replace(body.find("nav", {"class": "clearfix"}))

    return body.find("article", {"id": "article"})[0]
def add_pic(self, opened_file): """ Add picture to the Concept. Args: opened_file (file): opened file object """ # init meta if not self._meta: self._init_metadata() # get link to pic form data = download(url_context(self._meta["Přidej obrázek"]), session=self._session) dom = dhtmlparser.parseString(data) # get information from pic form form = first(dom.find("form", {"enctype": "multipart/form-data"})) add_pic_url = form.params["action"] # send pic data = self._session.post(url_context(add_pic_url), data={ "action": "addScreenshot2", "finish": "Nahrát" }, files={"screenshot": opened_file}) data = data.text.encode("utf-8") check_error_div(data, '<div class="error" id="screenshotError">')
def get_concepts(self):
    """
    Return all concepts (unpublished blogs).

    Returns:
        list: List of Concept objects.

    Raises:
        ValueError: if the user doesn't have a blog.
    """
    if not self.has_blog:
        raise ValueError("User doesn't have blog!")

    self.login()

    # get the f*****g untagged part of the site, where the links to the
    # concepts are stored
    data = self._get(self.blog_url)

    # no "Rozepsané zápisy" (drafts) heading -> no concepts at all
    if '<div class="s_nadpis">Rozepsané zápisy</div>' not in data:
        return []

    data = data.split('<div class="s_nadpis">Rozepsané zápisy</div>')[1]

    dom = dhtmlparser.parseString(data)
    concept_list = dom.find("div", {"class": "s_sekce"})[0]

    # links to concepts are stored in <li>
    concepts = []
    for li in concept_list.find("li"):
        a = li.find("a")[0]

        concepts.append(
            Concept(title=a.getContent().strip(),
                    link=a.params["href"],
                    session=self.session))

    return concepts
def from_user_id(user_id):
    """
    Transform `user_id` to instance of :class:`User`.

    Returns:
        obj: :class:`User` instance parsed from the `user_id`.
    """
    data = shared.download(url_context("/Profile/" + str(user_id)))
    dom = dhtmlparser.parseString(data)
    dhtmlparser.makeDoubleLinked(dom)

    shared.handle_errors(dom)

    # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
    # na abclinuxu.cz</a>
    a_tags = dom.find(
        "a",
        fn=lambda x: x.params.get("href", "").startswith("/lide/"))

    # pick only links which have content that starts with Seznam
    links = [
        a_tag.params["href"] for a_tag in a_tags
        if a_tag.getContent().startswith("Seznam")
    ]

    # /lide/<username>/... -> <username>
    username = links[-1].split("/")[2]

    return User(username)
def _get_user_id(self):
    """
    Resolve user's ID number for logged user.

    Returns:
        str: USER id as string.

    Raises:
        ValueError: if the navigation bar can't be parsed.
    """
    # cached after the first successful lookup
    if self._user_id is not None:
        return self._user_id

    self.login()
    dom = dhtmlparser.parseString(self._get(ABCLINUXU_URL))

    # resolve user's navigation panel
    nav_bar = dom.match(
        ["div", {
            "class": "hl_vpravo"
        }], {
            "tag_name": "a",
            "fn": lambda x: x.params.get("href", "").startswith("/Profile")
        })

    if not nav_bar:
        raise ValueError("Can't parse user's navigation bar!")

    profile_link = first(nav_bar).params["href"]

    # transform /Profile/24642?action=myPage -> 24642
    self._user_id = profile_link.split("?")[0].split("/")[-1]

    return self._user_id
def get_html_lang_tags(index_page):
    """
    Return `languages` stored in ``<meta>`` tags.

    ``<meta http-equiv="Content-language" content="cs">`` -> ``cs``

    Args:
        index_page (str): HTML content of the page you wish to analyze.

    Returns:
        list: List of :class:`.SourceString` objects.
    """
    dom = dhtmlparser.parseString(index_page)
    lang_tag = "content-language"

    def is_lang_meta(element):
        # case-insensitive match on the http-equiv attribute
        return element.params.get("http-equiv", "").lower() == lang_tag

    sources = []
    for meta in dom.find("meta", fn=is_lang_meta):
        if "content" in meta.params:
            sources.append(SourceString(meta.params["content"], "HTML"))
    return sources
def _parse_format_pages_isbn(html_chunk):
    """
    Parse format, number of pages and ISBN.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
                          details.

    Returns:
        tuple: (format, pages, isbn), all as string (or None when missing).
    """
    ppi = get_first_content(
        html_chunk.find("div", {"class": "price-overflow"})
    )

    if not ppi:
        return None, None, None

    # all information this function should parse are at one line
    # NOTE: relies on Python 2 filter() returning a list (indexed here)
    ppi = filter(lambda x: x.strip(), ppi.split("<br />"))[0]

    # parse isbn (stored in a <b> tag, if present)
    isbn = dhtmlparser.parseString(ppi)
    isbn = isbn.find("b")
    isbn = isbn[0].getContent() if isbn else None

    # parse pages and format, separated by "|"
    pages = None
    book_format = None
    details = ppi.split("|")

    if len(details) >= 2:
        book_format = details[0].strip()
        pages = details[1].strip()

    return book_format, pages, isbn
def cut_dom_to_area_of_interest(html): dom = html # make sure, that you don't modify `html` parameter if not isinstance(html, dhtmlparser.HTMLElement): dom = dhtmlparser.parseString(html) else: dom = copy.deepcopy(dom) dhtmlparser.makeDoubleLinked(dom) # comments are not stored in hierarchical structure, but in somehow # flat-nested lists # locate end of article ds_toolbox = dom.find("div", {"class": "ds_toolbox"}) if not ds_toolbox: raise ValueError("Couldn't locate ds_toolbox!") ds_toolbox = first(ds_toolbox) dom = ds_toolbox.parent # ged rid of everything until end of the article while dom.childs[0] != ds_toolbox: dom.childs.pop(0) dom.childs.pop(0) return dom
def login(username, password, http_proxy = None):
    """
    Just login into spotify. This is usefull, because users from unsupported
    countries have to login thru IP from supported country every ~twoweeks,
    or their account is frozen until they do so.

    Function supports http_proxy parameter in format "http://server:port".

    Raise:
        - SpotifierException if there is some problem.
    """
    d = Downloader(http_proxy = http_proxy)

    # fetch the login page to scrape the hidden form token
    dom = html.parseString(
        d.download(
            "https://www.spotify.com/us/login/?forward_url=%2Fus%2F",
        )
    )

    log_form = {
        "referrer": "",
        # hidden anti-bot/token field copied from the login form
        "utm-keywords":
            dom.find("input", {"name": "utm-keywords"})[0].params["value"],
        "user_name": username,
        "password": password
    }

    data = d.download(
        "https://www.spotify.com/us/xhr/json/login.php",
        post = log_form,
    )

    jdata = json.loads(data)
    if jdata["error"]:
        raise SpotifierException(jdata["msg"])
def _process_book(html_chunk):
    """
    Parse available informations about book from the book details page.

    Args:
        html_chunk (obj): HTMLElement containing slice of the page with
                          details.

    Returns:
        obj: :class:`structures.Publication` instance with book details.
    """
    title, book_url = _parse_title_url(html_chunk)

    # download page with details
    data = DOWNER.download(book_url)
    dom = dhtmlparser.parseString(
        handle_encodnig(data)
    )
    details = dom.find("div", {"id": "kniha_detail"})[0]

    # required parameters
    pub = Publication(
        title=title,
        authors=_parse_authors(html_chunk),
        price=_parse_price(details),
        publisher="CPress"
    )

    # optional parameters
    pub.optionals.URL = book_url
    pub.optionals.EAN = _parse_ean(details)
    pub.optionals.format = _parse_format(details)
    pub.optionals.pub_date = _parse_date(details)
    pub.optionals.description = _parse_description(details)

    return pub
def get_table():
    """Download the Wikipedia IPS article and return its first sortable table."""
    url = "https://en.wikipedia.org/wiki/Instructions_per_second"
    raw_page = downloader.download(url)
    parsed = dhtmlparser.parseString(raw_page)
    return parsed.find("table", {"class": "wikitable sortable"})[0]
def iter_blogposts(start=0, end=None, lazy=True):
    """
    Iterate over blogs. Based at bloglist.

    Args:
        start (int, default 0): Start at this page.
        end (int, default None): End at this page.
        lazy (bool, default True): Initialize :class:`.Blogpost` objects only
             with informations from listings. Don't download full text and
             comments.

    Yields:
        obj: :class:`.Blogpost` objects.
    """
    for cnt, url in enumerate(_next_blog_url(start)):
        data = _shared.download(url)
        data = _remove_crap_from_bloglist(data)

        # parse basic info about all blogs at page
        dom = _dhtmlparser.parseString(data)
        for bcnt, blog in enumerate(dom.findB("div", {"class": "cl"})):
            yield Blogpost.from_html(blog, lazy=lazy)

            # every page has 25 blogposts, but somethimes I am getting more
            if bcnt >= 24:
                break

        # detect end of pagination at the bottom
        if not _should_continue(dom):
            break

        if end is not None and cnt >= end:
            break
def _get_user_id(self):
    """
    Resolve user's ID number for logged user.

    Returns:
        str: USER id as string.

    Raises:
        ValueError: if the navigation bar can't be parsed.
    """
    # cached after the first successful lookup
    if self._user_id is not None:
        return self._user_id

    self.login()
    dom = dhtmlparser.parseString(self._get(ABCLINUXU_URL))

    # resolve user's navigation panel
    nav_bar = dom.match(
        ["div", {"class": "hl_vpravo"}],
        {
            "tag_name": "a",
            "fn": lambda x: x.params.get("href", "").startswith("/Profile")
        }
    )

    if not nav_bar:
        raise ValueError("Can't parse user's navigation bar!")

    profile_link = first(nav_bar).params["href"]

    # transform /Profile/24642?action=myPage -> 24642
    self._user_id = profile_link.split("?")[0].split("/")[-1]

    return self._user_id
def reg_header():
    # Test fixture: discussion-comment header of a REGISTERED user
    # (contains score / blog links and the "Výše" navigation link).
    return dhtmlparser.parseString(""" <div class="ds_hlavicka" id="9"> <div class="ds_reseni" style="display:none"> </div> 11.2. 15:21 <a href="/lide/manasekp">manasekp</a> | skóre: 27 | blog: <a href="/blog/manasekp">manasekp</a> | Brno <br> <span class="ds_control_sbalit2" id="comment9_toggle2"> <a onClick="schovej_vlakno(9)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a> <a onClick="rozbal_vse(9)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a> </span> Re: Bolest proxy <div id="comment9_controls"> <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&dizId=210591&threadId=9">Odpovědět</a> | <a onClick="schovej_vlakno(9)" id="comment9_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a> | <a href="#2" title="Odkaz na komentář o jednu úroveň výše">Výše</a> | <a href="#9" title="Přímá adresa na tento komentář">Link</a> | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&bUid=9480&url=/blog/show/400959#9" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a> | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&threadId=9" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a> </div> </div>""").find("div")[0]
def get_blogposts(self):
    """
    Lists all of users PUBLISHED blogposts. For unpublished, see
    :meth:`get_concepts`.

    Returns:
        list: sorted (old->new) list of Blogpost objects.
    """
    if not self.has_blog:
        return []

    def cut_crap(data):
        # keep only the blogpost listing between the "Píšeme jinde" box
        # and the page header
        data = data.split(
            '<div class="s_nadpis linkbox_nadpis">Píšeme jinde</div>'
        )[0]
        return data.split('<div class="st" id="st">')[1]

    cnt = 0
    posts = []
    parsed = [1]  # just placeholder for first iteration
    # paginate until a page yields no blogposts
    while parsed:
        data = self._get(self._compose_blogposts_url(cnt))
        dom = dhtmlparser.parseString(cut_crap(data))

        parsed = [
            Blogpost.from_html(blog_html)
            for blog_html in dom.find("div", {"class": "cl"})
        ]
        posts.extend(parsed)
        cnt += BLOG_STEP

    return sorted(posts, key=lambda x: x.created_ts)
def unreg_header():
    # Test fixture: discussion-comment header of an UNREGISTERED user
    # (no score / blog links, blacklisting keyed by display name).
    return dhtmlparser.parseString(""" <div class="ds_hlavicka" id="3"> <div class="ds_reseni" style="display:none"> </div> 10.2. 21:53 Tomáškova máma <br> <span class="ds_control_sbalit2" id="comment3_toggle2"> <a onClick="schovej_vlakno(3)" title="Schová nebo rozbalí celé vlákno">Rozbalit</a> <a onClick="rozbal_vse(3)" title="Schová nebo rozbalí vše pod tímto komentářem">Rozbalit vše</a> </span> Re: Bolest proxy <div id="comment3_controls"> <a href="/blog/EditDiscussion/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=add&dizId=210591&threadId=3">Odpovědět</a> | <a onClick="schovej_vlakno(3)" id="comment3_toggle1" title="Schová nebo rozbalí celé vlákno" class="ds_control_sbalit3">Sbalit</a> | <a href="#3" title="Přímá adresa na tento komentář">Link</a> | <a href="/EditUser;jsessionid=kufis2spplnh6gu671mxqe2j?action=toBlacklist&bName=Tom%C3%A1%C5%A1kova%20m%C3%A1ma&url=/blog/show/400959#3" title="Přidá autora na seznam blokovaných uživatelů">Blokovat</a> | <a href="/blog/EditRequest/400959;jsessionid=kufis2spplnh6gu671mxqe2j?action=comment&threadId=3" title="Žádost o přesun diskuse, stížnost na komentář">Admin</a> </div> </div> """).find("div")[0]
def test_parseString_cip():
    """With cip=False, parameter names keep their original case."""
    dom = dhtmlparser.parseString("""<html><tag PARAM="true"></html>""",
                                  cip=False)

    assert dom.childs
    assert len(dom.childs) == 2

    # opening and closing <html> are siblings at top level
    assert dom.childs[0].getTagName() == "html"
    assert dom.childs[1].getTagName() == "html"

    assert dom.childs[0].isOpeningTag()
    assert dom.childs[1].isEndTag()

    assert dom.childs[0].childs
    assert not dom.childs[1].childs

    assert dom.childs[0].childs[0].getTagName() == "tag"
    assert dom.childs[0].childs[0].params
    assert not dom.childs[0].childs[0].childs

    # case-sensitive params: only the original spelling exists
    assert "param" not in dom.childs[0].childs[0].params
    assert "PARAM" in dom.childs[0].childs[0].params

    assert dom.childs[0].childs[0].params["PARAM"] == "true"

    with pytest.raises(KeyError):
        dom.childs[0].childs[0].params["param"]
def test_findOneB():
    """findOneB() searches breadth-first and honours params/skip arguments."""
    dom = dhtmlparser.parseString(""" <root> <some id="first"> <something id="first"> <xe id="wanted xe" /> </something> <something id="second"> <xe id="another wanted xe" /> <something id="super" /> </something> <xe id="another xe" /> </some> <some id="second"> <something id="last"> <xe id="last xe" /> </something> </some> </root> """)

    none = dom.findOneB("nono")
    some = dom.findOneB("some")
    some2 = dom.findOneB("some", {"id": "second"})
    something = dom.findOneB("something", skip=2)
    another = dom.findOneB("xe")
    xe = dom.findOneB("xe", skip=1)

    assert len(some.childs) == 9
    assert len(some2.childs) == 4
    # identity comparison to None (PEP 8) instead of `== None`
    assert none is None
    assert something.params["id"] == "last"
    # BFS visits the shallower "another xe" before the nested ones
    assert another.params["id"] == "another xe"
    assert xe.params["id"] == "wanted xe"
def test_findOne():
    """findOne() searches depth-first and honours params/skip arguments."""
    dom = dhtmlparser.parseString(""" <root> <some id="first"> <something id="first"> <xe id="wanted xe" /> </something> <something id="second"> <xe id="another wanted xe" /> <something id="super" /> </something> <xe id="another xe" /> </some> <some id="second"> <something id="last"> <xe id="last xe" /> </something> </some> </root> """)

    none = dom.findOne("nono")
    some = dom.findOne("some")
    some2 = dom.findOne("some", {"id": "second"})
    something = dom.findOne("something", skip=2)
    xe = dom.findOne("xe")

    assert len(some.childs) == 9
    assert len(some2.childs) == 4
    # identity comparison to None (PEP 8) instead of `== None`
    assert none is None
    # DFS reaches the nested "super" as the third <something>
    assert something.params["id"] == "super"
    assert xe.params["id"] == "wanted xe"
def test_match():
    """match() follows the given tag-name path and returns all leaves."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> <xe id="another wanted xe" /> </something> <xe id="another xe" /> </some> <some> <something> <xe id="last wanted xe" /> </something> </some> </root> """)

    xe = dom.match("root", "some", "something", "xe")

    # the directly-nested "another xe" does NOT match the full path
    assert len(xe) == 3
    assert xe[0].params["id"] == "wanted xe"
    assert xe[1].params["id"] == "another wanted xe"
    assert xe[2].params["id"] == "last wanted xe"
def test_match_parameters():
    """match() accepts dict and [name, params] path elements, not just names."""
    dom = dhtmlparser.parseString(""" <root> <div id="1"> <div id="5"> <xe id="wanted xe" /> </div> <div id="10"> <xe id="another wanted xe" /> </div> <xe id="another xe" /> </div> <div id="2"> <div id="20"> <xe id="last wanted xe" /> </div> </div> </root> """)

    # path: plain name, dict spec, [name, params] list spec, plain name
    xe = dom.match(
        "root",
        {"tag_name": "div", "params": {"id": "1"}},
        ["div", {"id": "5"}],
        "xe"
    )

    assert len(xe) == 1
    assert first(xe).params["id"] == "wanted xe"
def test_wfind_multiple_matches():
    """Chained wfind() calls collect matches from every branch of the path."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> <xe id="another wanted xe" /> </something> <xe id="another xe" /> </some> <some> <something> <xe id="last wanted xe" /> </something> </some> </root> """)

    xe = dom.wfind("root").wfind("some").wfind("something").wfind("xe")

    # the shallow "another xe" is skipped - it is not under <something>
    assert len(xe.childs) == 3
    assert xe.childs[0].params["id"] == "wanted xe"
    assert xe.childs[1].params["id"] == "another wanted xe"
    assert xe.childs[2].params["id"] == "last wanted xe"
def test_wfind_complicated():
    """wfind() chains may end with find(); dead-end paths yield empty results."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> asd </something> <xe id="another xe" /> </some> <some> else <xe id="yet another xe" /> </some> </root> """)

    xe = dom.wfind("root").wfind("some").wfind("something").find("xe")

    assert len(xe) == 1
    assert first(xe).params["id"] == "wanted xe"

    # nonexistent path -> empty container, no exception
    unicorn = dom.wfind("root").wfind("pink").wfind("unicorn")

    assert not unicorn.childs
def get_publications():
    """
    Get list of publication offered by ben.cz.

    Returns:
        list: List of :class:`structures.Publication` objects.
    """
    data = DOWNER.download(URL)
    dom = dhtmlparser.parseString(data)

    book_list = dom.find("div", {"class": "seznamKniha"})

    assert book_list, "Can't find <div> with class 'seznamKniha'!"

    books = []
    for html_chunk in book_list:
        a = html_chunk.find("a")

        assert a, "Can't find link to the details of the book!"

        # skip not-yet-published ("pripravujeme") books
        if a[0].find("span", {"class": "ruzek pripravujeme"}):
            continue

        books.append(
            _process_book(a[0].params["href"])
        )

    return books
def test_findNextB():
    """findNextB() yields matches lazily, in breadth-first order."""
    dom = dhtmlparser.parseString(""" <root> <div> <something /> <div id=2> <xe /> </div> </div> <div id="three"> </div> <div id=4> <some> <div>foo</div> </some> <div /> </div> </root> """)

    gen = dom.findNextB("div")

    # the result is a lazy generator, not a materialized list
    assert isinstance(gen, GeneratorType)

    # list(gen) instead of a pass-through comprehension; `divs` instead of
    # the ambiguous single-letter name `l` (PEP 8 / E741)
    divs = list(gen)

    assert len(divs) == 11
    assert len(divs[0].childs) == 6
    # BFS: the id=4 sibling comes before the nested divs
    assert divs[4].params["id"] == "4"
def add_pic(self, opened_file): """ Add picture to the Concept. Args: opened_file (file): opened file object """ # init meta if not self._meta: self._init_metadata() # get link to pic form data = download( url_context(self._meta["Přidej obrázek"]), session=self._session ) dom = dhtmlparser.parseString(data) # get information from pic form form = first(dom.find("form", {"enctype": "multipart/form-data"})) add_pic_url = form.params["action"] # send pic data = self._session.post( url_context(add_pic_url), data={ "action": "addScreenshot2", "finish": "Nahrát" }, files={"screenshot": opened_file} ) data = data.text.encode("utf-8") check_error_div(data, '<div class="error" id="screenshotError">')
def test_match():
    """match() follows the given tag-name path and returns all leaves."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> <xe id="another wanted xe" /> </something> <xe id="another xe" /> </some> <some> <something> <xe id="last wanted xe" /> </something> </some> </root> """)

    xe = dom.match("root", "some", "something", "xe")

    # the directly-nested "another xe" does NOT match the full path
    assert len(xe) == 3
    assert xe[0].params["id"] == "wanted xe"
    assert xe[1].params["id"] == "another wanted xe"
    assert xe[2].params["id"] == "last wanted xe"
def test_wfind_multiple_matches():
    """Chained wfind() calls collect matches from every branch of the path."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> <xe id="another wanted xe" /> </something> <xe id="another xe" /> </some> <some> <something> <xe id="last wanted xe" /> </something> </some> </root> """)

    xe = dom.wfind("root").wfind("some").wfind("something").wfind("xe")

    # the shallow "another xe" is skipped - it is not under <something>
    assert len(xe.childs) == 3
    assert xe.childs[0].params["id"] == "wanted xe"
    assert xe.childs[1].params["id"] == "another wanted xe"
    assert xe.childs[2].params["id"] == "last wanted xe"
def test_wfind_complicated():
    """wfind() chains may end with find(); dead-end paths yield empty results."""
    dom = dhtmlparser.parseString(""" <root> <some> <something> <xe id="wanted xe" /> </something> <something> asd </something> <xe id="another xe" /> </some> <some> else <xe id="yet another xe" /> </some> </root> """)

    xe = dom.wfind("root").wfind("some").wfind("something").find("xe")

    assert len(xe) == 1
    assert first(xe).params["id"] == "wanted xe"

    # nonexistent path -> empty container, no exception
    unicorn = dom.wfind("root").wfind("pink").wfind("unicorn")

    assert not unicorn.childs
def test_predecesors_pattern():
    """predecesors_pattern() builds a root-to-element match path."""
    dom = dhtmlparser.parseString(""" <root> <xex> <x>content</x> </xex> </root> """)
    dhtmlparser.makeDoubleLinked(dom)

    x = dom.find("x")[0]

    res = path_patterns.predecesors_pattern(x, dom)

    assert res
    assert len(res) == 1
    assert isinstance(res[0], path_patterns.PathCall)
    # the generated call is a match() over the full ancestor chain
    assert res[0].call_type == "match"
    assert res[0].index == 0
    assert res[0].params == [
        ["root", None],
        ["xex", None],
        ["x", None],
    ]
def test_neighbours_pattern_text_neigh():
    """neighbours_pattern() records both text and tag neighbours of <xex>."""
    dom = dhtmlparser.parseString(""" asd <xex>\tHello</xex> <xep></xep> asd """)
    dhtmlparser.makeDoubleLinked(dom)

    xex = dom.find("xex")[0]

    res = path_patterns.neighbours_pattern(xex)

    assert res
    assert len(res) == 2

    left, right = res

    # left neighbour is the leading "asd" text node
    assert left.call_type == "left_neighbour_tag"
    assert left.index == 0
    assert res[0].params.tag_name == "xex"
    assert res[0].params.params == None
    assert left.params.fn_params == [None, None, "asd"]

    # right neighbour is the <xep> element
    # NOTE(review): both groups assert on res[0] (== left); the second group
    # probably intended `right.params...` - confirm against path_patterns
    assert right.call_type == "right_neighbour_tag"
    assert right.index == 0
    assert res[0].params.tag_name == "xex"
    assert res[0].params.params == None
    assert right.params.fn_params == ["xep", None, ""]
def filter_feed(chan_id, filter_item):
    """
    Download the Atom feed for `chan_id` and remove every <entry> for which
    `filter_item` returns a truthy value. Return the filtered XML string.
    """
    rss = _download_feed(chan_id)
    rss_dom = dhtmlparser.parseString(rss)

    for item in rss_dom.find("entry"):
        title = _pick_item_property(item, "title")
        link = _pick_item_property(item, "link")
        pub_date = _pick_item_property(item, "published")
        description = _pick_item_property(item, "content")
        real_link = _parse_description_link(description)

        # <link> carries the url in its href attribute
        if link:
            link = link.params.get("href", None)

        result = filter_item(
            title=title,
            link=link,
            real_link=real_link,
            pub_date=pub_date,
            description=description,
        )

        # truthy result -> drop this entry from the feed
        if result:
            item.replaceWith(dhtmlparser.HTMLElement(""))

    # re-emit with a fresh XML declaration (prettify's first line is dropped)
    xml = rss_dom.prettify().splitlines()
    return '<?xml version="1.0" encoding="UTF-8"?>' + "\n".join(xml[1:])
def from_user_id(user_id):
    """
    Transform `user_id` to instance of :class:`User`.

    Returns:
        obj: :class:`User` instance parsed from the `user_id`.
    """
    data = shared.download(url_context("/Profile/" + str(user_id)))
    dom = dhtmlparser.parseString(data)
    dhtmlparser.makeDoubleLinked(dom)

    shared.handle_errors(dom)

    # <li><a href="/lide/unittest/objekty" rel="nofollow">Seznam příspěvků
    # na abclinuxu.cz</a>
    a_tags = dom.find(
        "a",
        fn=lambda x: x.params.get("href", "").startswith("/lide/")
    )

    # pick only links which have content that starts with Seznam
    links = [
        a_tag.params["href"]
        for a_tag in a_tags
        if a_tag.getContent().startswith("Seznam")
    ]

    # /lide/<username>/... -> <username>
    username = links[-1].split("/")[2]

    return User(username)
def test_match_parameters():
    """match() accepts dict and [name, params] path elements, not just names."""
    dom = dhtmlparser.parseString(""" <root> <div id="1"> <div id="5"> <xe id="wanted xe" /> </div> <div id="10"> <xe id="another wanted xe" /> </div> <xe id="another xe" /> </div> <div id="2"> <div id="20"> <xe id="last wanted xe" /> </div> </div> </root> """)

    # path: plain name, dict spec, [name, params] list spec, plain name
    xe = dom.match("root", {
        "tag_name": "div",
        "params": {
            "id": "1"
        }
    }, ["div", {
        "id": "5"
    }], "xe")

    assert len(xe) == 1
    assert first(xe).params["id"] == "wanted xe"
def test_findNextB():
    """findNextB() yields matches lazily, in breadth-first order."""
    dom = dhtmlparser.parseString(""" <root> <div> <something /> <div id=2> <xe /> </div> </div> <div id="three"> </div> <div id=4> <some> <div>foo</div> </some> <div /> </div> </root> """)

    gen = dom.findNextB("div")

    # the result is a lazy generator, not a materialized list
    assert isinstance(gen, GeneratorType)

    # list(gen) instead of a pass-through comprehension; `divs` instead of
    # the ambiguous single-letter name `l` (PEP 8 / E741)
    divs = list(gen)

    assert len(divs) == 11
    assert len(divs[0].childs) == 6
    # BFS: the id=4 sibling comes before the nested divs
    assert divs[4].params["id"] == "4"
def edit(self, text, title=None, date_of_pub=None):
    """
    Edit concept.

    Args:
        text (str): New text of the context.
        title (str, default None): New title of the concept. If not set,
              old title is used.
        date_of_pub (str/int, default None): Date string in abclinuxu
                    format or timestamp determining when the concept should
                    be automatically published.

    Note:
        `date_of_pub` can be string in format ``"%Y-%m-%d %H:%M"``.
    """
    if not self._meta:
        self._init_metadata()

    # fetch the edit form
    data = download(
        url_context(self._meta["Uprav zápis"]),
        session=self._session
    )
    dom = dhtmlparser.parseString(data)

    form = dom.find("form", {"name": "form"})

    assert form, "Can't find edit form!"
    form = first(form)

    form_action = form.params["action"]

    # keep the current title when none was supplied
    if title is None:
        title = first(form.find("input", {"name": "title"}))
        title = title.params["value"]

    date = ""
    if date_of_pub is None:
        # keep the currently-set publication date
        date = first(form.find("input", {"name": "publish"}))
        date = date.params["value"]
    elif isinstance(date_of_pub, basestring):
        # already a formatted date string
        date = date_of_pub
    else:
        # timestamp -> abclinuxu date format
        date = ts_to_concept_date(date_of_pub)

    data = download(
        url=url_context(form_action),
        method="POST",
        data={
            "cid": 0,
            "publish": date,
            "content": text,
            "title": title,
            "delay": "Ulož",
            "action": "edit2"
        },
        session=self._session
    )

    # the server reports validation problems inline
    check_error_div(data, '<div class="error" id="contentError">')
    check_error_page(data)
def test_containsParamSubset():
    """containsParamSubset() is True iff every given param matches."""
    div = first(dhtmlparser.parseString("<div id=x class=xex></div>").find("div"))

    # every subset of the element's params matches
    for subset in ({"id": "x"}, {"class": "xex"}, {"id": "x", "class": "xex"}):
        assert div.containsParamSubset(subset)

    # one unknown key makes the whole subset fail
    assert not div.containsParamSubset({"asd": "bsd", "id": "x", "class": "xex"})
def test_parse_description_missing():
    """_parse_description() returns None for an empty description <div>."""
    html = """ <div class="detailPopis"></div> """

    result = ben_cz._parse_description(d.parseString(html))

    assert result is None
def test_equality_of_output_with_comment():
    """Serializing a parsed document with comments reproduces the input."""
    inp = """<head> <!-- <link rel="stylesheet" type="text/css" href="style.css"> --> </head> """

    dom = dhtmlparser.parseString(inp)

    # str(dom) instead of calling the dunder __str__() directly (idiom)
    assert str(dom) == inp
def test_params():
    """Params can be read, and resetting them drops attributes from output."""
    element = first(dhtmlparser.parseString("<xe id=1 />").find("xe"))
    assert element.params["id"] == "1"

    # wiping params removes the attribute from the serialized form
    element.params = {}
    assert str(element) == "<xe />"
def _apply_blacklist(self, text):
    """
    Remove blacklisted elements (links, italics, quotes, code blocks)
    from `text` and return the cleaned HTML string.
    """
    dom = dhtmlparser.parseString(text)

    # empty tag_name with fn= matches ANY element passing the predicate
    blacklist = dom.find("", fn=lambda x: x.getTagName() in [
        "i",
        "a",
        "bq",
        "pre",
        "italic",
        "blockquote",
    ])

    # replace each blacklisted element with an empty DOM
    for el in blacklist:
        el.replaceWith(dhtmlparser.parseString(""))

    return str(dom)
def _get_twitter_button_tag(cls):
    """Build the tweet-button <a> element pointing at the bundled image."""
    markup = (
        '<a class="twitter-share-button" id="twitter_button" href="#">'
        '<img src="%s" />'
        '</a>\n'
    ) % AddStaticFiles.tweet_button_ref
    return dhtmlparser.parseString(markup)