def test_generate_pdf_with_amendement_content_gouvernemental(
        app, lecture_senat, article1_senat, amendements_senat):
    """A government-authored amendement renders with author "Gouvernement".

    Uses the zam_repondeur fixtures; checks only the structure of the HTML
    produced for PDF printing (first-page header, page titles, cartouche,
    and the response body).
    """
    from zam_repondeur.models import DBSession
    from zam_repondeur.services.import_export.pdf import generate_html_for_pdf

    amendement_6666 = amendements_senat[0]
    amendement_6666.auteur = "LE GOUVERNEMENT"
    amendement_6666.user_content.reponse = "La présentation"
    DBSession.add(amendement_6666)
    parser = HTMLParser(
        generate_html_for_pdf(DummyRequest(), "print/all.html",
                              {"lecture": lecture_senat}))
    # \xa0 is the non-breaking space between "nº" and the texte number.
    assert (
        parser.css_first(".first-page .lecture").text() ==
        "Sénat, session 2017-2018, Séance publique, Numéro lecture, texte nº\xa063"
    )
    assert _html_page_titles(parser) == [
        "Article 1", "Réponse", "Amendement nº 6666"
    ]
    response_node = parser.css_first(".reponse")
    # Cartouche: alternating label/value pairs at the top of the response page.
    assert _cartouche_to_list(response_node) == [
        "Article",
        "Art. 1",
        "Amendement",
        "6666",
        "Auteur",
        "Gouvernement",
    ]
    # No avis/objet set, so the only section is the response itself.
    assert response_node.css_first("div h5").text() == "Réponse"
    assert "La présentation" in response_node.css_first("div p").text()
def scrape(self):
    """Fetch every URL in ``self.links`` and collect stories into ``self.output``.

    Each story is a dict with keys 'content' (list of paragraph strings),
    'headline', 'time-stamp' (formatted %m/%d/%Y, %H:%M:%S), 'url' and
    'journal'.
    """
    super().scrape()
    articles = []
    for i, URL in enumerate(self.links):
        try:
            r = urllib.request.urlopen(URL)
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt
            # and SystemExit; keep the best-effort skip, stop masking those.
            print('Skipping:', URL)
            continue
        sll = HTMLParser(r.read())
        print(i + 1, '/', len(self.links), URL)
        headline = sll.css_first(
            'meta[name="dc.title"]').attributes['content']
        timestamp = parse(
            sll.css_first(
                'meta[name="dcterms.created"]').attributes['content'])
        main_article = sll.css_first('.article-body')
        story = {
            'content': [],
            'headline': headline,
            'time-stamp': timestamp.strftime("%m/%d/%Y, %H:%M:%S"),
            'url': URL,
            'journal': self.journal,
        }
        for paragraph in main_article.css('p'):
            story['content'].append(paragraph.text(deep=True, separator=''))
        articles.append(story)
    self.output = articles
def getDefinition(terme):
    """Print the JeuxDeMots definition of *terme*, then recurse on related terms.

    NOTE(review): the recursion keeps no visited set, so cyclic relations can
    loop indefinitely; the original author left the trailing-character
    handling (e.g. "m'") as a TODO.
    """
    termeHttp = urllib.parse.quote_plus(terme, encoding='iso-8859-1')
    url = ('http://www.jeuxdemots.org/rezo-dump.php?gotermsubmit=Chercher'
           '&gotermrel=' + termeHttp + '&rel=1')
    html = r.get(url)
    tree = HTMLParser(html.text)
    try:
        definition = tree.css_first('def').text()
        code = tree.css_first('CODE').text()
    except AttributeError as error:
        # css_first returns None when the term is unknown on the site.
        print("Le mot" + terme + " n'existe pas", error)
        return
    print("Definition pour " + terme + ": \n" + definition)
    # Bug fix: *terme* was interpolated unescaped into the pattern, so regex
    # metacharacters in a term would corrupt it, and "\>" in a plain string
    # literal is an invalid escape sequence (DeprecationWarning).
    match = re.findall(r"(" + re.escape(terme) + r"\>[^0-9].*)", code)
    match.reverse()
    for m in match:
        # Drop the trailing character; should be fixed in the regex instead
        # (handles terms like "m'") per the original TODO.
        m = m[:-1]
        getDefinition(m)
def scrape(self):
    """Fetch each CNN link in ``self.links`` and collect stories into ``self.output``.

    Any failure while fetching or parsing a page skips that page.
    """
    super().scrape()
    articles = []
    for i, URL in enumerate(self.links):
        try:
            r = urllib.request.urlopen(URL)
            sll = HTMLParser(r.read())
            print(i + 1, '/', len(self.links), URL)
            headline = sll.css_first(
                'meta[property="og:title"]').attributes['content']
            main_article = sll.css_first(
                'section[id="body-text"],.Article__content')
            timestamp = parse(
                sll.css_first(
                    'meta[name="pubdate"],meta[property="og:pubdate"]'
                ).attributes['content'])
            story = {
                'content': [],
                'headline': headline,
                'time-stamp': timestamp.strftime("%m/%d/%Y, %H:%M:%S"),
                'url': URL,
                'journal': self.journal,
            }
            for paragraph in main_article.css(
                    '.zn-body__paragraph,.Paragraph__component'):
                story['content'].append(
                    paragraph.text(deep=True, separator=''))
            articles.append(story)
        except Exception:
            # Was a bare `except:` around the whole body; narrowed so
            # KeyboardInterrupt/SystemExit still propagate.
            print('Skipping:', URL)
            continue
    self.output = articles
async def resolve_link(self, url):
    """Return (thread_id, name) extracted from the forum page at *url*."""
    page = HTMLParser(await self.session(custom_url=url))
    href = page.css_first("a.mfd-link-dotted").attributes['href']
    thread_id = int(href.split('?threadId=')[1])
    name = page.css_first("div.mfd-header").text().strip()
    return thread_id, name
def detect_lang_worker(filepath):
    """Detect the language of a saved HTML page.

    Returns (lang, basename) where *lang* is whatever
    ``simpletools.detectLang(..., detect_all=True)`` returns, or
    (None, None) when detection fails.
    """
    # read file
    with open(filepath, 'rb') as f:
        page_raw = f.read()
    tree = HTMLParser(page_raw)
    text = ""
    # detect lang by og:description ...
    description = tree.css_first("meta[property=\"og:description\"]")
    if description:
        text = description.attributes['content'].strip()
    # ... or fall back to og:title
    if not text:
        title = tree.css_first("meta[property=\"og:title\"]")
        # Bug fix: pages without an og:title tag used to raise
        # AttributeError here (css_first returns None).
        if title:
            text = title.attributes['content']
    # detect lang precisely
    try:
        lang = simpletools.detectLang(text, detect_all=True)
    except Exception:  # narrowed from a bare except
        return None, None
    return lang, os.path.basename(filepath)
def core_course_get_contents(self, username, password, course_id):
    """Quick-and-dirty code to list the topics (sections) of a course.

    Returns a list of numbered topic titles, or a placeholder message when
    the course has no downloadable sections. Re-authenticates (and refetches
    the page) if the session has expired.
    """
    page = HTMLParser(
        self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                         str(course_id)).text)
    # The login page title means the session expired — re-authenticate.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/course/view.php?id=" +
                str(course_id)).text)
    counter = 1
    contents = []
    # (bare string below is original author's note: "collect all topics")
    "Kursdagi barcha mavzularni olamiz"
    ''' >>> s='' >>> for node in page.css("li"): ... if 'id' in node.attributes: ... if node.attributes['id'][:7]=='section' and node.attributes['id']!='sectio n-0': print(node.child.css_first("span").text()); s+= node.child.css_first('span').text() '''
    for tag in page.tags("li"):
        if 'id' in tag.attributes:
            # Topics live in <li id="section-N">; section-0 is the header.
            if tag.attributes['id'][:7] == "section" and tag.attributes[
                    'id'] != "section-0":
                # Only keep sections that contain downloadable resources.
                if not ('resource' in tag.html):
                    continue
                section = HTMLParser(tag.html)
                contents.append(
                    str(counter) + ". " + section.css_first("span").text())
                counter += 1
    if contents == []:
        # Uzbek: "There is nothing to download here :/"
        contents = ["Bu yerda yuklanadigan hech narsa yo'q :/"]
    return contents
async def resolve_link(self, url):
    """Return (id, name) for the page at *url* (name = last token of the h1)."""
    page = HTMLParser(await self.session(custom_url=url))
    name = page.css_first("div.mfd-header h1").text().strip().split(' ')[-1]
    href = page.css_first("div.mfd-header div a").attributes['href']
    return int(href.split('?id=')[1]), name
def core_course_get_tasks(self, username, password, course_id):
    # Lists the assignment topics of a course.
    """Return assignments as a list, e.g. ["1-deadline 6280", ..., "task its_id"]."""
    page = HTMLParser(
        self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                         str(course_id)).text)
    # Login-page title means the session expired — re-authenticate.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return []
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/course/view.php?id=" +
                str(course_id)).text)
    tasks = []
    counter = 1
    ids = []  # assignment ids already emitted, to avoid duplicates
    # Walk the <li> tags again: assignments are listed inside the sections.
    # (Original note: "too many ifs, should look for a cleaner way".)
    for tag in page.tags('li'):
        if 'id' in tag.attributes:
            if tag.attributes['id'][:7] == 'section':
                # Only sections that contain assignment links.
                if not ('http://moodle.fbtuit.uz/mod/assign/view.php?'
                        in tag.html):
                    continue
                section = HTMLParser(tag.html)
                theme = section.css_first('span').text() + '\n'
                for tag1 in section.tags('a'):
                    if not ('http://moodle.fbtuit.uz/mod/assign/view.php?'
                            in tag1.attributes['href']):
                        continue
                    # Skip ids we have already listed.
                    if tag1.attributes['href'][tag1.attributes['href'].
                                               rfind("=") + 1:] in ids:
                        continue
                    # NOTE(review): the source was garbled here; the literal
                    # appears to contain a newline after "N. ".
                    tasks.append(theme + "├" + str(counter) + ". \n" +
                                 tag1.text() + " " +
                                 tag1.attributes['href']
                                 [tag1.attributes['href'].rfind("=") + 1:])
                    ids.append(tag1.attributes['href']
                               [tag1.attributes['href'].rfind("=") + 1:])
                    counter += 1
                    theme = ''  # only prefix the theme on its first task
                    '''s = '' for input_tag in tag1.css('input'): if input_tag.attributes['name']!='modulename' and input_tag.attributes['name']!='id': continue s = input_tag.attributes['value']+" "+s tasks[-1]+=s'''
                # Mark the last task of the section with a closing branch.
                # NOTE(review): placement reconstructed from garbled source —
                # must run only after at least one task was appended.
                tasks[-1] = tasks[-1].replace("├", "└")
    if tasks == []:
        # Uzbek: "There are no assignments here :)"
        return ["Bu yerda topshiriqlar yo'q :)"]
    return tasks
def parse_html(path: str) -> dict:
    """
    Parses from HTML:

    - key
    - title
    - issns (list)
    - wikidata_qid
    - homepage_url
    - acronym (?)

    TODO: publisher?
    """
    key = path.replace('.html', '')
    if len(key.split('/')) != 2:
        print(key, file=sys.stderr)
        return {}
    meta = dict(dblp_prefix=key, issns=[])
    try:
        with open(path, 'r') as html_file:
            doc = HTMLParser(html_file.read())
    except FileNotFoundError:
        return {}

    headline = doc.css_first('header#headline h1')
    if headline and headline.text():
        meta['title'] = headline.text()
        # "Journal Name (JN)" -> title "Journal Name", acronym "JN"
        if meta['title'].endswith(')') and meta['title'].count('(') == 1:
            meta['acronym'] = meta['title'].split('(')[-1][:-1]
            meta['title'] = meta['title'].split('(')[0].strip()

    # <a href="https://portal.issn.org/resource/issn/2624-8212" itemprop="sameAs">
    # <a href="https://www.wikidata.org/entity/Q15753736" itemprop="sameAs">
    for link in doc.css('header#headline a[itemprop="sameAs"]') or []:
        href = link.attributes.get('href')
        if not href:
            continue
        if "://portal.issn.org/" in href:
            issn = href.split('/')[-1].strip()
            if len(issn) == 9:
                meta['issns'].append(issn)
            else:
                print(issn, file=sys.stderr)
        elif "://www.wikidata.org/entity/Q" in href:
            meta['wikidata_qid'] = href.split('/')[-1]
            assert 'Q' in meta['wikidata_qid']

    # <a href="https://journals.sagepub.com/home/hfs" itemprop="url">...</a>
    homepage = doc.css_first('header#headline a[itemprop="url"]')
    if homepage and homepage.attributes.get('href'):
        meta['homepage_url'] = homepage.attributes['href']
    return meta
def parse_topic_content(page_content: str, url: str):
    """Build a TopicData from a raw topic page.

    Extracts the first post's text and whether the topic is locked,
    runs ``process()`` on the container, and returns it.
    """
    tree = HTMLParser(page_content)
    post_text = tree.css_first(
        '.bg1 > .inner > .postbody > * > .content').text()
    is_closed = bool(tree.css_first('.fa-lock'))
    topic = TopicData(content=post_text, url=url, closed=is_closed)
    topic.process()
    return topic
def test_generate_pdf_with_amendement_content_factor_only_groups(
        app, lecture_senat, article1_senat, amendements_senat):
    """Two amendements with identical avis/objet/réponse share one response page.

    Authors are both listed, but the (identical) group appears only once in
    the cartouche ("Groupes" is factorised).
    """
    from zam_repondeur.models import DBSession
    from zam_repondeur.services.import_export.pdf import generate_html_for_pdf

    amendement_6666 = amendements_senat[0]
    amendement_6666.auteur = "M. JEAN"
    amendement_6666.groupe = "Les Indépendants"
    amendement_6666.user_content.avis = "Favorable"
    amendement_6666.user_content.objet = "L’objet"
    amendement_6666.user_content.reponse = "La réponse"
    DBSession.add(amendement_6666)
    # Same user content as 6666 so both amendements collapse into one réponse.
    amendement_9999 = amendements_senat[1]
    amendement_9999.auteur = "M. CLAUDE"
    amendement_9999.groupe = "Les Indépendants"
    amendement_9999.user_content.avis = "Favorable"
    amendement_9999.user_content.objet = "L’objet"
    amendement_9999.user_content.reponse = "La réponse"
    DBSession.add(amendement_9999)
    parser = HTMLParser(
        generate_html_for_pdf(DummyRequest(), "print/all.html",
                              {"lecture": lecture_senat}))
    # \xa0 is the non-breaking space between "nº" and the texte number.
    assert (
        parser.css_first(".first-page .lecture").text() ==
        "Sénat, session 2017-2018, Séance publique, Numéro lecture, texte nº\xa063"
    )
    assert _html_page_titles(parser) == [
        "Article 1",
        "Réponse",
        "Amendement nº 6666",
        "Amendement nº 9999",
    ]
    response_node = parser.css_first(".reponse")
    # Cartouche: plural labels, amendements/authors joined, single group.
    assert _cartouche_to_list(response_node) == [
        "Article",
        "Art. 1",
        "Amendements",
        "6666 et 9999",
        "Auteurs",
        "M. CLAUDE et M. JEAN",
        "Groupes",
        "Les Indépendants",
        "Avis",
        "Favorable",
    ]
    assert response_node.css_first("div h5").text() == "Objet"
    assert "L’objet" in response_node.css_first("div p").text()
    assert response_node.css("div h5")[-1].text() == "Réponse"
    assert "La réponse" in response_node.css("div p")[-1].text()
def scrape(self):
    """Fetch each link with a browser User-Agent and collect stories into ``self.output``.

    Pages that fail to fetch are skipped; pages whose selectors match
    nothing produce a story with empty headline/timestamp/content.
    """
    super().scrape()
    articles = []
    # Loop-invariant: same headers for every request, build once.
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    for i, URL in enumerate(self.links):
        try:
            req = urllib.request.Request(url=URL, headers=headers)
            r = urllib.request.urlopen(req)
        except Exception:  # narrowed from a bare except
            print('Skipping:', URL)
            continue
        sll = HTMLParser(r.read())
        print(i + 1, '/', len(self.links), URL)
        headline = sll.css_first('.news-title')
        main_article = sll.css('.text.en > p')
        timestamp = sll.css_first('.date')
        try:
            headline = headline.text(deep=True, separator='').strip()
            timestamp = timestamp.text(deep=True, separator='').strip()
            timestamp = timestamp.encode('ascii', 'ignore').decode('utf-8')
        except Exception:
            # Selectors matched nothing (nodes are None) -> empty story shell.
            headline = ''
            timestamp = ''
            main_article = ''
        story = {
            'content': [],
            'headline': headline,
            'time-stamp': timestamp,  # e.g. 2020-03-11 16:24:48
            'url': URL,
            'journal': self.journal,
        }
        for paragraph in main_article:
            line = paragraph.text(deep=True, separator='').strip()
            line = line.encode('ascii', 'ignore').decode('utf-8')
            if line:
                story['content'].append(line)
        articles.append(story)
    self.output = articles
async def get_weather(location: str) -> str:
    """Fetch and format a Yandex.Weather report for *location*.

    Searches for the location first; if the search redirects straight to a
    weather page, that page is used, otherwise the first hit from the
    location list is followed. Returns a Russian-language summary string.
    """
    async with aiohttp.request('GET',
                               domain + weather_request + quote(location),
                               headers={'User-Agent': user_agent}) as resp:
        search_text = await resp.text()
        title = HTMLParser(search_text).css_first('title').text()
        possible_href = str(resp.url)
    if title != 'Яндекс.Погода':
        # if we got rerouted to weather
        weather_text = search_text
        exact_location = ''
        for node in HTMLParser(weather_text).css('span.breadcrumbs__title'):
            exact_location += node.text() + ','
        exact_location = exact_location[:-1]  # drop trailing comma
        href = possible_href
    else:
        # if we got a location list as we expected: follow the first hit
        node = HTMLParser(search_text).css_first('div.grid__cell')
        if node is None:
            return f'По запросу "{location}" ничего не найдено'
        node = node.css_first('li.place-list__item')
        node = node.css_first('a')
        href = domain + node.attributes['href']
        exact_location = node.text()
        async with aiohttp.request('GET', href,
                                   headers={'User-Agent': user_agent}) as resp:
            weather_text = await resp.text()
    # parsing weather: current conditions come from the fact card.
    card = HTMLParser(weather_text).css_first('div.content__main').css_first(
        'div.content__row').css_first('div.card')
    temp_info = card.css_first('div.fact__temp-wrap').css_first('a')
    now_temp = temp_info.css_first('div.fact__temp').css_first(
        'span.temp__value').text()
    now_condition = temp_info.css_first('div.fact__feelings').css_first(
        'div.link__condition').text()
    wind_info = card.css_first('div.fact__props').css_first(
        'dl.fact__wind-speed').css_first('dd.term__value')
    now_wind = wind_info.css_first('span.wind-speed').text() + ' ' + \
        wind_info.css_first('span.fact__unit').text()
    # Daily forecast: find the slide labelled "Сегодня" (today).
    day_info = HTMLParser(weather_text).css_first(
        'div.forecast-briefly').css_first('div.swiper-wrapper')
    slide = None
    for day in day_info.css('div.swiper-slide'):
        text: str = day.text()
        if text.find('Сегодня') != -1:
            slide = day.css_first('a')
    # NOTE(review): if no slide contains "Сегодня", slide stays None and the
    # next line raises AttributeError — confirm the page always has it.
    day_temp = slide.css_first('div.forecast-briefly__temp_day').css_first(
        'span.temp__value').text()
    night_temp = slide.css_first(
        'div.forecast-briefly__temp_night').css_first(
            'span.temp__value').text()
    condition = slide.css_first('div.forecast-briefly__condition').text()
    return f'Место: {exact_location}' \
        f'\n\nCЕЙЧАС:\nТемпература: {now_temp}\nСостояние: {now_condition}\nВетер: {now_wind}' \
        f'\n\nCЕГОДНЯ:\nТемпература днем: {day_temp}\nТемпература ночью: {night_temp}\nСостояние: {condition}'\
        f'\n\nПолный прогноз: {href}'
async def google_it(query: str, how_many: int = 1) -> str:
    """Google *query* and return up to *how_many* results as formatted text.

    Returns the Russian "nothing found" message when the result container
    is missing or empty.
    """
    async with aiohttp.request('GET', base_query + quote(query),
                               headers={'User-Agent': user_agent}) as resp:
        text = await resp.text()
    results = []
    search_result_node = HTMLParser(text).css_first('div[eid]')
    if search_result_node is None:
        return 'Ничего не нашел'
    nodes = search_result_node.css_first('div > div.srg').css_first(
        'div.srg').css('div.g')
    # Idiom fix: manual `i = 0; i += 1` counter replaced by enumerate.
    for i, node in enumerate(nodes):
        node = node.css_first('div[data-ved]').css_first('div.rc')
        header_node = node.css_first('div.r').css_first('a')
        url = header_node.attributes['href']
        header_node = node.css_first('h3').css_first('div')
        title = header_node.text().strip()
        print(f'{i}: {title} {url}')
        results.append(f'Описание: {title}\nСсылка: {url}\n')
    if results:  # truthiness instead of len(...) > 0
        return '\n'.join(results[:how_many])
    return 'Ничего не нашел'
async def check_update(self) -> Page:
    """Fetch the page and wrap the first ``div.trt`` post in a single-post Page."""
    # Bug fix: the original passed "html.parser" (a BeautifulSoup idiom) as
    # selectolax HTMLParser's second positional argument, i.e. as a truthy
    # detect_encoding flag. HTMLParser takes only the markup here.
    parser = HTMLParser(await self.session())
    post = parser.css_first("div.trt").html
    return Page([
        SinglePost(md=(await self.pretty_text(post)).strip(),
                   title=self.title)
    ])
def process(self, query):
    """Look up ``query.term`` remotely and delegate to ``processGet``.

    Returns None when the response carries no CODE block.
    """
    response = r.get(self.buildUrl(query.term))
    tree = HTMLParser(response.text)
    code_node = tree.css_first('CODE')
    if not code_node:
        return None
    def_node = tree.css_first('def')
    definition = def_node.text() if def_node else ''
    return self.processGet(query.term, query.properties, code_node.text(),
                           definition, query)
def get_imdb_page(show):
    """Search IMDb for *show* and return the URL of its title page.

    Raises Exception on a non-200 response or when the search yields no
    results. Also updates the module-level request counter and progress
    display as a side effect.
    """
    global requests
    logging.info("Scraping information for show: " + show)
    # We want to query imdb one time
    url = ('https://www.imdb.com/search/title?title=' + show +
           '&title_type=tv_series,tv_miniseries&sort=popularity')
    # Making a response and parsing it
    response = get(url, headers=headers)
    if response.status_code != 200:
        logging.warning('Received status code, ' + str(response.status_code))
        raise Exception("Received a non-200 status code!")
    parser = HTMLParser(response.text)
    # Update progress display (module-level `requests` counter / start_time).
    requests += 1
    elapsed_time = time() - start_time
    os.system('clear')
    print('Request: {}; Frequency: {} requests/s'.format(
        requests, requests / elapsed_time))
    # We only care about the divs that have the movie name;
    # the first .lister-item-header link is the most popular match.
    if len(parser.css(".lister-item-header a")) <= 0:
        logging.warning('Did not find any results for: ' + show)
        # Typo fix in the error message: "valif" -> "valid".
        raise Exception("Did not find a valid imdb page")
    imdb_page = "https://www.imdb.com" + parser.css_first(
        ".lister-item-header a").attributes['href']
    return imdb_page
def test_relative_url():
    """A root-relative href resolves against the base URL's origin."""
    doc = HTMLParser('<html><a href="/test/relative">Testing</a></html>')
    anchor = Ahref(doc.css_first("a"), "https://www.google.com/tester1")
    assert anchor.absolute_url == "https://www.google.com/test/relative"
def core_course_get_courses(self, username, password): page = HTMLParser( self.session.get("http://moodle.fbtuit.uz/my/").text ) #todo bundan ham optimal qilish mumkin :/ if page.css_first( 'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт": if not self.core_auth_confirm_user(username, password): return [] #Parol xato bo'lsa bo'sh list qaytaradi page = HTMLParser( self.session.get("http://moodle.fbtuit.uz/my/").text) "dasturlash 136 shaklida bo'ladi har bir list elementi" "kursnomi idsi" course_list = [] ''' Parse ni quyidagi qism bo'yicha amalga oshiradi: <div class="media-body"> <h4 class="h5"><a href="http://moodle.fbtuit.uz/course/view.php?id=173" class="">Kurs nomi</a></h4> </div> ''' for node in page.css("div"): if 'class' in node.attributes: if node.attributes['class'] == 'media-body' and node.text( ) != '': if node.text().strip( ) + " " + node.css_first('a').attributes['href'][ node.css_first('a').attributes['href'].find("=") + 1:] in course_list: break course_list.append( node.text().strip() + " " + node.css_first('a').attributes['href'] [node.css_first('a').attributes['href'].find("=") + 1:]) return course_list
def core_course_get_files(self, username, password, course_id, section):
    """Download every 'resource' file of one course section into ./temp.

    Returns 0 on success, 0xff when re-authentication fails.
    """
    page = HTMLParser(
        self.session.get("http://moodle.fbtuit.uz/course/view.php?id=" +
                         str(course_id)).text)
    # Login-page title means the session expired — re-authenticate.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return 0xff
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/course/view.php?id=" +
                str(course_id)).text)
    # Narrow the page down to the requested <li id="section-N"> block;
    # topics and files live inside the <li> tags.
    for tag in page.tags('li'):
        if 'id' in tag.attributes:
            if tag.attributes['id'] == 'section-' + str(section):
                page = HTMLParser(tag.html)  # reuse `page` for the section
                break
    # Collect the distinct link targets within the section.
    links = []
    for tag in page.tags('a'):
        if not (tag.attributes['href'] in links):
            links.append(tag.attributes['href'])
    try:
        os.mkdir(os.getcwd() + "/temp")
    except Exception as e:
        pass  # directory already exists
    for link in links:
        # Only Moodle 'resource' links are downloadable files.
        if not ('resource' in link):
            continue
        resp = self.session.get(link, allow_redirects=True)
        # Final redirect target's last path segment is the file name.
        file_name = resp.url[resp.url.rfind("/") + 1:]
        file_name = unquote(file_name)
        if 'view.php' in file_name:
            continue  # not an actual file, just another page
        with open(os.getcwd() + "/temp/" + file_name, 'wb') as file:
            file.write(resp.content)
    return 0
def core_course_get_grades(self, username, password):
    # TODO(original): later take a course_id and show per-course details;
    # also "make this prettier" per the original note.
    """Return the overview grades as a "Kurs nomi|Baho" table string.

    Returns "" when re-authentication fails.
    """
    page = HTMLParser(
        self.session.get(
            "http://moodle.fbtuit.uz/grade/report/overview/index.php").text)
    # Login-page title means the session expired — re-authenticate.
    if page.css_first(
            'title').text() == "TATUFF Masofaviy ta'lim: Вход на сайт":
        if not self.core_auth_confirm_user(username, password):
            return ""
        page = HTMLParser(
            self.session.get(
                "http://moodle.fbtuit.uz/grade/report/overview/index.php").
            text)
    grades = "Kurs nomi|Baho\n"
    # The grades live in the table's <td> cells: name, grade, name, grade...
    counter = 0
    for node in page.css('td'):
        if node.text() == "":
            return grades  # empty cell marks the end of the table
        grades += node.text()
        counter += 1
        # Odd cells are course names -> separator; even cells end the row.
        grades += "|" if counter % 2 else "\n"
    # Bug fix: the original fell off the end of the loop and implicitly
    # returned None when no empty <td> terminated the table.
    return grades
def SelectTitle(self):
    """Preview the configured CSS title selector and ask the user to confirm it.

    Exits the program when the selector is missing or the user rejects the
    preview.
    """
    titleSelect = self.CONFIG.GetTitleCSS()
    if titleSelect == "":
        print(
            "Title Selector unspecified, please add a css selector for the game titles under SiteInfo > TitleSelector in the configuration file: {}"
            .format(self.CONFIG.filename))
        quit()
    # Retry the first link until it fetches successfully.
    pageContents, last_status_code = "", -1
    while last_status_code != 200:
        target = self.links[0]
        try:
            last_status_code, pageContents = self.getContents(target)
        except requests.exceptions.ConnectionError:
            print("Unable to connect, retrying")
            continue
    titletree = HTMLParser(pageContents)
    titlepreview = titletree.css_first(titleSelect).text()
    print("Title Preview:")
    print(titlepreview)
    confirm = ""
    while confirm not in ("y", "n"):
        print("Does this game's title match? {} (y)es/(n)o".format(
            self.links[0]))
        confirm = input()
    if confirm == "n":
        # Message fix: was "Please reconfigugre your the css selector ...".
        print(
            "Please reconfigure the css selector for the game titles under SiteInfo > TitleSelector in the configuration file: {}"
            .format(self.CONFIG.filename))
        quit()
def test_node_comparison_fails():
    """A node never compares equal to None, numbers, or arbitrary classes."""
    node = HTMLParser("""<div id="test"></div>""").css_first('#test')
    for candidate in (None, 123, object):
        assert node != candidate
def scrape_podcast(link):
    """Download the episode audio linked from the podcast page at *link*."""
    page_url = base_url + link
    print('[*] Scraping', page_url)
    tree = HTMLParser(requests.get(page_url, headers=headers).content)
    anchor = tree.css_first('a.btn.btn-default.subscribe-btn.btn-sm')
    dl_link = base_url + str(anchor.attrs['href'])
    file_name = dl_link.split('/')[-1]
    print('[+] Downloading', file_name)
    with open(output_dir + file_name, 'wb') as file:
        file.write(requests.get(dl_link, headers=headers).content)
def html_guess_platform(url: str, doc: HTMLParser,
                        biblio: Optional[BiblioMetadata]) -> Optional[str]:
    """Best-effort guess of the publishing platform behind an HTML page.

    Checks, in order: the <meta name="generator"> tag (falling back to an
    #developedBy link), an OJS body id, raw-HTML substring fingerprints,
    the favicon host, and finally the URL. Returns a short platform id
    ("ojs", "wordpress", ...) or None. *biblio* is currently unused here.
    """
    generator: Optional[str] = None
    generator_elem = doc.css_first("meta[name='generator']")
    if generator_elem:
        generator = generator_elem.attrs["content"]
    else:
        generator_elem = doc.css_first("a[id='developedBy']")
        if generator_elem:
            generator = generator_elem.text()
    # Generator-string matches (most specific first: "systems 3" before
    # plain "open journal systems").
    if generator and "open journal systems 3" in generator.lower():
        return "ojs3"
    elif generator and "open journal systems" in generator.lower():
        return "ojs"
    elif generator and "plone" in generator.lower():
        return "plone"
    elif generator and "wordpress" in generator.lower():
        return "wordpress"
    elif generator and "blogger" in generator.lower():
        return "blogger"
    elif doc.css_first("body[id='pkp-common-openJournalSystems']"):
        return "ojs"
    else:
        # Raw-HTML substring fingerprints; doc.html can fail to decode on
        # badly-encoded pages, in which case this step is skipped.
        try:
            if ('powered by <a target="blank" href="http://pkp.sfu.ca/ojs/">PKP OJS</a>'
                    in doc.html):
                return "ojs"
            if 'Powered by <a target="_blank" href="http://arphahub.com">' in doc.html:
                return "arpha"
            if "<meta property='og:image' content='http://cms.galenos.com.tr' />" in doc.html:
                return "galenos"
        except UnicodeDecodeError:
            pass
    # Favicon host as a weaker signal.
    icon_elem = doc.css_first("link[type='image/x-icon']")
    if icon_elem and "href" in icon_elem.attrs:
        if "journalssystem.com" in icon_elem.attrs["href"]:
            return "journalssystem.com"
        elif "indexcopernicus.com" in icon_elem.attrs["href"]:
            return "indexcopernicus"
    if "scielo" in url:
        return "scielo"
    return None
def test_node_insert_after():
    """insert_after places a node from another tree right after the target."""
    tree = HTMLParser(
        '<div>Get <span alt="Laptop"><img src="/jpg"> <div></div></span></div>'
    )
    donor = HTMLParser('<div>Test</div>')
    tree.css_first('img').insert_after(donor.body.child)
    expected = '<div>Get <span alt="Laptop"><img src="/jpg"><div>Test</div> <div></div></span></div>'
    assert tree.body.child.html == expected
def test_external_url():
    """A link pointing at another domain is not considered internal."""
    doc = HTMLParser(
        '<html><a href="https://www.getevents.nl/tester">Getevents</a></html>'
    )
    link = Ahref(doc.css_first("a"), "https://www.google.com/getevents")
    assert not link.is_internal
def get_html_title(path):
    """Return the <title> text of the HTML file at *path*.

    Returns None when the file has no <title> tag or the tag is empty.
    """
    with open(path, 'r', encoding='utf8', errors='ignore') as fh:
        tree = HTMLParser(fh.read())
    title_node = tree.css_first('title')
    if title_node is None:
        return None
    # An empty title string is normalised to None as well.
    return title_node.text(deep=False) or None
def test_attrs_test_dict_features():
    """node.attrs supports the dict protocol: keys/values/len/get/contains."""
    element = HTMLParser('<div id="id" v data-id="foo"></div>').css_first('div')
    element.attrs['new_att'] = 'new'
    assert list(element.attrs.keys()) == ['id', 'v', 'data-id', 'new_att']
    # Valueless attributes (here "v") come back as None.
    assert list(element.attrs.values()) == ['id', None, 'foo', 'new']
    assert len(element.attrs) == 4
    assert element.attrs.get('unknown_field', 'default_value') == 'default_value'
    assert 'id' in element.attrs
    assert 'vid' not in element.attrs