def parse_and_extract(url: str, year: int = current_year):
    html_txt = url_to_file(url)
    r_html = HTML(html=html_txt)
    table_class = ".imdb-scroll-table"
    r_table = r_html.find(table_class)
    table_data = []
    headers = []
    if len(r_table) == 1:
        parsed_table = r_table[0]
        rows = parsed_table.find('tr')
        header_row = rows[0]
        header_col = header_row.find('th')
        headers = [head.text for head in header_col]
        for row in rows[1:]:
            cols = row.find('td')
            row_data = []
            for i, col in enumerate(cols):
                row_data.append(col.text)
            table_data.append(row_data)
    path = os.path.join(BASE_DIR, 'data')
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, f'{year}.csv')
    df = pd.DataFrame(table_data, columns=headers)
    df.to_csv(filepath, index=False)

def parse_and_extract(name=None):
    html_text = url_to_text(save=True, name=name)
    r_html = HTML(html=html_text)
    table_class = ".table"
    r_table = r_html.find(table_class)
    table_data_dicts = []
    header_names = []
    if len(r_table) == 0:
        return False
    parsed_table = r_table[0]
    rows = parsed_table.find("tr")
    header_row = rows[0]
    header_cols = header_row.find("th")
    header_names = [x.text for x in header_cols]
    for row in rows[1:]:
        cols = row.find("td")
        row_dict_data = {}
        for i, col in enumerate(cols):
            # Dictionaries work when all header_names are unique; with duplicate
            # header names the data gets overwritten. For the list-based version
            # see scrape.py.
            header_name = header_names[i]
            row_dict_data[header_name] = col.text
        print(row_dict_data)
        table_data_dicts.append(row_dict_data)
    df = pd.DataFrame(table_data_dicts)
    path = os.path.join(Base_dir, 'data')
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, f"{name}.csv")
    df.to_csv(filepath, index=False)
    return True

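# A minimal, hypothetical sketch (data made up for illustration) of the trade-off noted
# above: with duplicate header names, dict-based rows silently overwrite values, while
# the list-based rows in scrape.py keep every column.
headers_demo = ["Rank", "Gross", "Gross"]          # duplicated header name
cells_demo = ["1", "$100M", "$40M"]
as_dict = {h: c for h, c in zip(headers_demo, cells_demo)}
as_list = list(zip(headers_demo, cells_demo))
print(as_dict)   # {'Rank': '1', 'Gross': '$40M'} - the first "Gross" value is lost
print(as_list)   # [('Rank', '1'), ('Gross', '$100M'), ('Gross', '$40M')] - all values kept
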
def parse_html(self):
    html = HTML(html=self.origin_request.content.decode('utf-8', 'ignore').encode('utf-8'),
                url=self.origin_request.url)
    for item in html.find('.latnewslist'):
        url = item.absolute_links.pop()
        origin_id = int(url.split('/')[-1])
        _id = "{0}_{1}".format(self.origin, origin_id)
        title = item.find('.entry > a > h3', first=True).text.strip()
        if _id not in self.done_ids and self.is_title_needed(title):
            notice = {
                'id': _id,
                'url': url,
                'origin': constants.ORIGINS[self.origin],
                'origin_id': origin_id,
                'title': title,
            }
            notice_detail = HTML(html=self.session.get(url).content.decode('utf-8', 'ignore').encode('utf-8'),
                                 url=url)
            notice['content'] = notice_detail.find('.dtl-content', first=True).text
            # cut the article off at the "上一篇" ("previous post") marker
            end_index = notice['content'].index('上一篇')
            if end_index > 0:
                notice['content'] = notice['content'][:end_index]
            notice['posted_at'] = int(
                datetime.datetime.strptime(
                    notice_detail.find('.new-dtl-info > span', first=True).text[:19],
                    '%Y-%m-%d %H:%M:%S').timestamp())
            notice['short_content'] = item.find('.news-brief', first=True).text
            self.update_line(notice)

def parse():
    session = HTMLSession()
    r = session.get(REQUEST_URL, headers={"accept": "application/json"})
    threads = r.json().get("threads")
    parsed_data = []
    for thread in threads[1:]:
        timestamp = thread.get("posts")[0].get("timestamp")
        timestamp = datetime.fromtimestamp(timestamp).astimezone(tz)
        if timestamp > datetime.now().astimezone(tz) - timedelta(hours=1):
            thread_number = thread.get("thread_num")
            thread_subject = thread.get("posts")[0].get("subject")
            text = thread.get("posts")[0].get("comment")
            text = HTML(html=text)
            links = text.find("a")
            text = text.html
            text = text.replace("<br>", "\n")  # replace HTML line breaks with newlines
            text = bs4.BeautifulSoup(text, features="lxml").get_text()
            # rewrite HTML links as Markdown links
            for link in links:
                href = link.attrs["href"]
                text = text.replace(href, "[" + href + "](" + href + ")")
            thread_files = thread.get("posts")[0].get("files")
            thread_files = [thread_file.get("path") for thread_file in thread_files] if thread_files else None
            thread_link = "2ch.hk/news/res/" + str(thread_number) + ".html"
            text = __format_text(text, thread_subject, thread_link)
            parsed_data.append(ThreadInfo(thread_number, timestamp, thread_subject,
                                          text, thread_files, thread_link))
    return parsed_data

def test_bare_render():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val
    assert html.find('html')
    assert 'https://httpbin.org' in html.links

def extract_data(html_text):
    r_html = HTML(html=html_text)
    table_id = '#customers'
    r_table = r_html.find(table_id)
    parsed_table = r_table[0]
    rows = parsed_table.find('tr')
    header_row = rows[0]
    header_names = [col.text for col in header_row.find('th')]
    table_data = []
    for row in rows[1:]:
        cols = row.find('td')
        row_data = []
        for col in cols:
            row_data.append(col.text)
        table_data.append(row_data)
    return header_names, table_data

def get_overall_stats(self):
    html = HTML(html=self.page)
    counters = html.find('#maincounter-wrap')
    main_counter = []
    cases = {}
    for entry in counters:
        main_counter.append(int(entry.text.split('\n')[1].replace(',', '')))
    cases["Total cases"] = main_counter[0]
    cases["Total deaths"] = main_counter[1]
    cases["Total recovered"] = main_counter[2]
    cases["Active cases"] = cases["Total cases"] - cases["Total recovered"] - cases["Total deaths"]
    cases["Mortality ratio"] = round(cases["Total deaths"] * 100 / cases["Total cases"], 2)
    return cases

def test_get_items_from_page(self, HTMLSession):
    html = HTML(html=load_html('basepage'))
    HTMLSession.return_value.get.return_value = DummyResponse(html)
    spell = self._create_spell({'itemListSelector': '#unselect'})
    result = spell._get_items_from_page('test_url')
    for item in result:
        assert item

def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>
    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """
    html = HTML(html=doc)
    html.render()
    assert html.find('#replace', first=True).text == 'yolo'

def test_html_to_list_valid_html(self):
    """Should convert requests_html.HTML instance to `list`"""
    html = HTML(html=" \
        <tr><th>A</th><th>V</th></tr> \
        <tr><td>a1</td><td>v1</td></tr> \
        <tr><td>a2</td><td>v2</td></tr> \
    ")
    self.assertEqual(html_to_list(html),
                     [['A', 'V'], ['a1', 'v1'], ['a2', 'v2']])

def _get_next_page(self, current_page: HTML):
    next_page_href = current_page.find('a', containing='>>', first=True)
    next_page_html = self._get_url_content('{url}{next_page}'.format(
        url=self.url, next_page=next_page_href.attrs['href']))
    if current_page.url == next_page_html.url:
        return None
    return next_page_html

def parse(text):
    html = HTML(html=text)

    # find login
    pulldown = html.find("#account_pulldown", first=True)
    if not pulldown:
        raise UnknownBackendResponse()
    login = pulldown.text

    # find steam id
    variable = 'g_steamID = "'
    start = text.find(variable)
    if start == -1:
        raise UnknownBackendResponse()
    start += len(variable)
    end = text.find('";', start)
    steam_id = text[start:end]

    return steam_id, login

def to_dict(htmltext):
    script_str = 'define("detail"'
    script_str1 = "define('detail'"
    text = HTML(html=htmltext)
    try:
        key_word_obj = text.find("script", containing=script_str)[0].text
    except IndexError:
        key_word_obj = text.find("script", containing=script_str1)[0].text
    key_word_obj = key_word_obj.replace("\\", "")
    key_word = re.search(r'return(.*?})', key_word_obj, re.S).group(1)
    key_word = key_word.replace("\\", "")
    keyword_dict = demjson.decode(json.dumps(key_word, ensure_ascii=False))
    if isinstance(keyword_dict, str):
        keyword_dict = demjson.decode(keyword_dict)
    return keyword_dict

def parse(text):
    html = HTML(html=text)
    rows = html.find(".achieveRow")
    achievements = []
    try:
        for row in rows:
            unlock_time = row.find(".achieveUnlockTime", first=True)
            if unlock_time is None:
                continue
            unlock_time = int(self.parse_date(unlock_time.text).timestamp())
            name = row.find("h3", first=True).text
            achievements.append((unlock_time, name))
    except (AttributeError, ValueError, TypeError):
        logging.exception("Can not parse backend response")
        raise UnknownBackendResponse()
    return achievements

def parse_article_content(doc):
    html = HTML(html=doc)
    entry = html.find('div.bbs-screen')
    content = entry[0].text
    # strip the PTT posting-template boilerplate lines from the article body
    content = content.replace("【1.請注意兩日內僅能徵、賣、估各1篇,切勿2PO or 以上 】", "")
    content = content.replace("【2.非本板討論範圍請勿PO文(詳細規定請看置底板規) 】", "")
    content = content.replace("【3.確定無誤再發文,發現有誤請大T修標題大E修內文 】", "")
    content = content.replace("【4.無用的整行文字 (例此行以上) 可按「Ctrl+Y」刪除整行】", "")
    content = content.replace("【5.賣出後勿清空內文、標題、價格,違者水桶2個月 】", "")
    content = content.replace("【6.勿刪除他人推文,違者退文並水桶1個月 】", "")
    content = content.replace("【7.請 先 按 「Ctrl+V」!! 還原色碼後,方可正常編輯 】", "")
    content = content.replace("(沒有明確價格、賣出後清空價格,水桶2個月)", "")
    content = content.replace("(購買日期、保固有無、使用期間、新舊程度)", "")
    content = content.replace("(官方規格、網拍連結、實物品樣照片)", "")
    content = content.replace("(自取、面交、郵寄、宅急便)", "")
    content = content.replace("(限面交者請交待詳細區域地點!!)", "")
    content = content.replace("(站內信、手機、即時通訊軟體、如何稱呼)", "")
    return content

def get_page_count():
    """Get the total number of pages (as an integer) from the main "Auction History" page."""
    global max_page_hour
    time.sleep(1)
    main_r = requests.get(root_url, headers=request_headers)
    if main_r.status_code == 200:
        main_r_html = HTML(html=main_r.text)
        page_numbers = main_r_html.find(".PageLink")
        main_r.close()
        max_page = int(list(page_numbers[-1].links)[0].split("page=")[-1])
        max_page_hour = datetime.datetime.now().hour
        return max_page
    else:
        main_r.close()
        return 0

def test_create_task(self):
    """Check if task will be created."""
    self.client.post("/new-task", data=dict(
        task_id="foobar",
        description="barfoo",
        src="http://foo.bar/mets.xml",
        workflow_id=self.get_workflow_id(),
        default_file_grp="file_grp"))
    task = models.Task.get()
    response = self.client.get("/tasks")
    html = HTML(html=response.data)
    assert len(html.find('table > tr > td')) == COLUMN_COUNT
    assert html.find('table > tr > td')[2].text == "file_grp"
    status_col_txt = html.find('table > tr > td')[6].text
    assert status_col_txt.startswith("CREATED")
    assert "worker_task.id" in status_col_txt
    self.client.get(f"/task/delete/{task.uid}")

def parse(text, user_profile_url):
    html = HTML(html=text)

    # find persona_name
    div = html.find("div.profile_header_centered_persona", first=True)
    if not div:
        fallback_div = html.find("div.welcome_header_ctn")
        if fallback_div:
            logger.info("Fresh account without set up steam profile.")
            raise UnfinishedAccountSetup()
        logger.error("Can not parse backend response - no div.profile_header_centered_persona")
        raise UnknownBackendResponse()
    span = div.find("span.actual_persona_name", first=True)
    if not span:
        logger.error("Can not parse backend response - no span.actual_persona_name")
        raise UnknownBackendResponse()
    persona_name = span.text

    # find steam id
    variable = 'g_steamID = "'
    start = text.find(variable)
    if start == -1:
        logger.error("Can not parse backend response - no g_steamID variable")
        raise UnknownBackendResponse()
    start += len(variable)
    end = text.find('";', start)
    steam_id = text[start:end]

    # find miniprofile id
    profile_link = f'{user_profile_url}" data-miniprofile="'
    start = text.find(profile_link)
    if start == -1:
        logger.error("Can not parse backend response - no steam profile href")
        raise UnknownBackendResponse()
    start += len(profile_link)
    end = text.find('">', start)
    miniprofile_id = text[start:end]

    return steam_id, miniprofile_id, persona_name

def get_root():
    res = requests.get(root_url)
    text = res.text
    html_page = HTML(html=text, url=root_url)
    links = html_page.links
    pattern = r'\?start=.+'
    for link in links:
        if re.search(pattern, link):
            pages_list.append(link)
    parse_page(text)

def get_questions(tag: str, days: int):
    to_date = datetime.date.today()
    from_date = to_date - datetime.timedelta(days)
    url = "https://api.stackexchange.com/docs/questions"
    params = {
        'order': 'desc',
        'sort': 'activity',
        'filter': 'default',
        'tagged': tag,
        'site': 'stackoverflow',
        'fromdate': f'{from_date}',
        'todate': f'{to_date}',
    }
    response = requests.get(url, params=params)
    html_str = response.text
    html = HTML(html=html_str)
    questions_summaries = html.find('.question-summary')
    print(questions_summaries)
    print(html_str)

def get_shows_html(doc):
    html = HTML(html=doc)
    # deduplicate movie names, as they can repeat across the text
    content = set()
    for elem in html.find('font[style*="italic"]'):
        title = elem.text.strip()
        if title and title != ".":
            if ',' in title:
                for name in title.split(','):
                    name = name.strip()
                    if name:
                        content.add(name)
            else:
                content.add(title)
    return content

async def main(urlPamar):
    res = await request(urlPamar)
    res = HTML(html=res)
    about = res.find('div.list-item-desc-top a')
    for i, title in enumerate(about):
        nextUrl = f'https:{title.attrs["href"]}'
        nextHtml = await request(nextUrl)
        print(f'{i + 1} [{title.text}](https:{title.attrs["href"]})')
        try:
            startIndex = nextHtml.index('"address":')
            endIndex = nextHtml.index(',"extraInfos"')
            print(nextHtml[startIndex:endIndex])
        except ValueError:
            if "验证中心" in nextHtml:  # the page redirected to the verification (captcha) center
                print("需要验证")  # "verification required"
            else:
                print(nextHtml)

def get_news_url_list(pages: int) -> list:
    '''Collects news URLs from the given number of listing pages.'''
    url_news = "/nba/news"
    r = session.get(url=url_base + url_news, headers=headers)
    links = []
    while pages > 0:
        html = HTML(html=r.text)
        news_list_body = html.find('#news_list_body', first=True)
        links.extend(list(news_list_body.links))
        page_link_next = html.find('div > gonext > a[data-id=right]', first=True).attrs['href']
        r = session.get(url=url_base + page_link_next, headers=headers)
        pages -= 1
    return links

def parse_movies_from_url(url):
    data = []
    headers = []
    html_txt = url_to_html(url)
    r_html = HTML(html=html_txt)
    r_table = r_html.find('#table')
    if len(r_table) > 0:
        parsed_table = r_table[0]
        rows = parsed_table.find('tr')
        headers = [header.text for header in rows[0].find('th')]
        for row in rows[1:]:
            columns = row.find('td')
            row_data = []
            for column in columns:
                row_data.append(column.text)
            data.append(row_data)
    return headers, data

def description_urls(self):
    pages = self.__find_pages__(INDEX_PAGE_URL)
    urls = OrderedDict()
    for index_page in pages:
        self.__log__.info('Processing page: %s', index_page)
        r = self.__session__.get(index_page)
        doc = HTML(html=r.html.find('html', first=True).html)
        self.__log__.info(' -> start parsing HTML')
        for t in doc.find('.col_premierecondition'):
            spans = t.find('span')
            for s in spans:
                if __is_class__(s, 'lien') and 'Fi' in s.text:
                    item = s.text.replace('-', '').strip()
                    url = s.find('a', first=True).attrs['href']
                    self.__log__.info('%s -> %s', item, url)
                    urls[item] = BASE_URL.format(url)
    return urls

def parse(self, response):
    # decode the URL
    url = urllib.parse.unquote(response.url).strip()
    if str(response.url).find("error.html") != -1:
        # if the current page is an error page, return immediately
        return
    # Crawling walks page views but yields items, so first check whether the URL is
    # already stored to avoid duplicate writes; if it was already fetched, return.
    if response.url in self.urlGettedSet:
        return
    html = HTML(html=response.text)  # wrap the response so requests-html can parse it
    list1 = html.find('.lemmaWgt-subLemmaListTitle')
    # polysemantList = html.find('.polysemantList-wrapper,cmn-clearfix', first=True)
    # if the page only contains a polysemant (disambiguation) list
    if list1:
        lemmaWgtElement = html.find(".custom_dot,para-list,list-paddingleft-1", first=True)
        urlList = baikeLinkExtractor1(lemmaWgtElement)  # get the synonym links
        for link in urlList:
            if link not in self.urlGettedSet:
                req = scrapy.http.request.Request(link, callback=self.parse)
                yield req
    else:
        # otherwise, extract all Baike links on the page and follow them
        print(response)
        urlList = baikeLinksExtractor(html)
        for link in urlList:
            # links taken from the page are items, so no further check is needed
            if link not in self.urlGettedSet:
                req = scrapy.http.request.Request(link, callback=self.parse)
                yield req
    # 1. write the current page's URL and HTML to a file
    # replace special characters in the URL with underscores
    filename = re.sub("[/?&=#.\"'\\:*<>\|]", "_", url.split("/", 4)[-1])
    fitem = FileItem()
    # URLs visited in this run do not need to be added to the visited set again
    # (a single run never re-parses them), but they are written out so the next
    # run can read them back.
    fitem['Name'] = filename + ".txt"
    fitem['Content'] = str(html.html)
    yield fitem
    urlItem = UrlItem()
    urlItem['url'] = response.url
    yield urlItem

def _get_posts(path, pages=10, timeout=5, sleep=0, credentials=None):
    """Gets posts for a given account."""
    global _session, _timeout
    url = f'{_base_url}/{path}'

    _session = HTMLSession()
    _session.headers.update(_headers)

    if credentials:
        _login_user(*credentials)

    _timeout = timeout
    response = _session.get(url, timeout=_timeout)
    html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
    cursor_blob = html.html

    while True:
        for article in html.find('article'):
            yield _extract_post(article)

        pages -= 1
        if pages == 0:
            return

        cursor = _find_cursor(cursor_blob)
        next_url = f'{_base_url}{cursor}'

        if sleep:
            time.sleep(sleep)

        try:
            response = _session.get(next_url, timeout=timeout)
            response.raise_for_status()
            data = json.loads(response.text.replace('for (;;);', '', 1))
        except (RequestException, ValueError):
            return

        for action in data['payload']['actions']:
            if action['cmd'] == 'replace':
                html = HTML(html=action['html'], url=_base_url)
            elif action['cmd'] == 'script':
                cursor_blob = action['code']

def _get_group_posts(path, pages=10, timeout=5, sleep=0, credentials=None, extra_info=False):
    """Gets posts for a given group."""
    global _session, _timeout
    url = f'{_base_url}/{path}'

    _session = HTMLSession()
    _session.headers.update(_headers)

    if credentials:
        _login_user(*credentials)

    _timeout = timeout

    while True:
        response = _session.get(url, timeout=_timeout)
        response.raise_for_status()
        html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
        cursor_blob = html.html

        for article in html.find('article'):
            post = _extract_post(article)
            if extra_info:
                post = fetch_share_and_reactions(post)
            yield post

        pages -= 1
        if pages <= 0:
            return

        cursor = _find_cursor(cursor_blob)
        if cursor is not None:
            url = f'{_base_url}{cursor}'

        if sleep:
            time.sleep(sleep)

def get_script_sources(url: str, body: str) -> List[str]:
    """Get script sources

    Arguments:
        url {str} -- A URL
        body {str} -- An HTTP response body

    Returns:
        List[str] -- A list of script sources
    """
    html = HTML(html=body)

    sources: List[str] = []
    for script in html.find("script"):
        source = script.attrs.get("src")
        if source is not None:
            sources.append(normalize_source(url, source))

    return list(set(sources))

class HTMLVotesParser:
    def __init__(self, html):
        self.html = HTML(html=html)
        self.date = None
        self.date_votes = None
        self.topic = None
        self.kind = None

    def next_td(self):
        for tr in self.html.find("tr"):
            for td in tr.find("td"):
                classes = td.attrs.get("class", ())
                yield td, classes

    def parse(self) -> VoteList:
        student = ""
        dates = []
        for td, classes in self.next_td():
            text = td.text
            if self._is_student(classes):
                student = td.find("span")[2].text
            if self._is_new_day(classes):
                self._init_new_day(dates, text)
            elif self._is_processing_day():
                self._process_day(classes, text)
        if self.date:
            dates.append((self.date, self.date_votes))
        return VoteList(student=student, votes=dates)

    def _process_day(self, classes, text):
        if "intestazioni" in classes:
            if not self.topic:
                self.topic = text
            else:
                self.kind = text
        elif "voto_" in classes:
            vote = Vote(self.topic, self.kind, text)
            self.topic = None
            self.date_votes.append(vote)

    def _is_processing_day(self):
        return self.date is not None

    def _init_new_day(self, dates, new_date):
        if self.date:
            dates.append((self.date, self.date_votes))
        self.date = new_date
        self.date_votes = []

    def _is_new_day(self, classes):
        return "registro" in classes

    def _is_student(self, classes):
        return "page-usr-user" in classes

def submit_form(session, response, data=None):
    from requests_html import HTML

    html = HTML(url=response.url, html=response.text)
    forms = html.find('form')
    if len(forms) == 0:
        raise Exception('Page does not have any forms')
    form = forms[0]
    url = form.attrs['action']
    fields = form.find('input')
    data = data or {}
    for field in fields:
        name = field.attrs['name']
        if name not in data:
            value = field.attrs['value']
            data[name] = value
    response = session.post(urljoin(response.url, url), data=data)
    return response

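# A hypothetical usage sketch for submit_form above; the URL and field names are
# illustrative assumptions, not taken from the source. It fetches a page containing a
# form, overrides two fields, and posts the form back, letting the remaining inputs
# keep their pre-filled values.
import requests

session = requests.Session()
page = session.get("https://example.com/login")
result = submit_form(session, page, data={"username": "alice", "password": "secret"})
print(result.status_code)
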
def _weblint_html(path: pathlib.Path, doctype: str) -> set:
    '''HTML Lint for WebLint.'''
    DEPRECATED_TAGS = {
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'frameset', 'noframes', 'acronym', 'big', 'u', 'isindex', 'basefont',
        'dir', 'applet', 'style',
    }
    REQUIRED_TAGS = {
        'html': (
            (('head', '==', 1), 'HS0013'),
            (('body', '==', 1), 'HS0014'),
        ),
        'head': (
            (('title', '==', 1), 'HS0015'),
            (('meta', '>=', 1), 'HS0018'),
            (('script', '==', 0), 'HP0001'),
        ),
        'ul': ((('li', '>=', 1), 'HS0019'),),
        'ol': ((('li', '>=', 1), 'HS0020'),),
        'select': ((('option', '>=', 1), 'HS0021'),),
        'dl': (
            (('dt', '>=', 1), 'HS0022'),
            (('dd', '>=', 1), 'HS0023'),
        ),
        'video': ((('source', '>=', 1), 'HS0024'),),
        'audio': ((('source', '>=', 1), 'HS0026'),),
        'details': ((('summary', '==', 1), 'HS0029'),),
        'aside': ((('main', '==', 0), 'HA0006'),),
        'figure': ((('figcaption', '==', 1), 'HS0044'),),
    }
    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img',
        'keygen', 'link', 'meta', 'output', 'param', 'track', 'wbr', 'source',
    }
    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio', 'bdi', 'bdo',
        'blockquote', 'body', 'button', 'canvas', 'caption', 'cite', 'code',
        'col', 'colgroup', 'data', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'div', 'dl', 'dt', 'em', 'fieldset', 'figure', 'figcaption',
        'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head',
        'header', 'hgroup', 'html', 'ins', 'kbd', 'label', 'legend', 'li',
        'main', 'map', 'menu', 'menuitem', 'meter', 'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup', 'p', 'picture', 'pre',
        'progress', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 'samp', 'script',
        'section', 'select', 'span', 'strong', 'sub', 'summary', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead',
        'time', 'title', 'tfoot', 'tr', 'ul', 'var', 'video',
    }
    DEPRECATED_ATTRS = {
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'valign', 'accept', 'vspace',
    }
    GLOBAL_ATTRS = {'lang', 'id', 'class', 'title', 'hidden'}
    VALID_ATTRS = {
        'charset', 'name', 'src', 'content', 'controls', 'type', 'href',
        'alt', 'rel', 'value', 'min', 'max',
    }
    BOOL_ATTRS = {'controls', 'hidden'}
    REQUIRED_ATTRS = {
        'html': (('lang',), 'HS0012'),
        'video': (('controls',), 'HS0027'),
        'source': (('src', 'type'), 'HS0025'),
        'audio': (('controls',), 'HS0028'),
        'a': (('href',), 'HS0031'),
        'img': (('src',), 'HS0033'),
        'input': (('type',), 'HS0035'),
        'link': (('rel', 'href'), 'HS0040'),
        'script': (('src',), 'HS0042'),
        'progress': (('value', 'max'), 'HS0045'),
        'meter': (('value', 'min', 'max'), 'HS0046'),
    }
    REQUIRED_ATTRS_ACCESS = {
        'img': (('alt',), 'HA0001'),
        'a': (('title',), 'HA0007'),
    }
    NOEMPTY_TAGS = {
        ('title', 'HS0016'), ('p', 'HS0017'), ('summary', 'HS0030'),
        ('a', 'HS0032'), ('video', 'HA0002'), ('audio', 'HA0003'),
        ('h1', 'HS0036'), ('h2', 'HS0036'), ('h3', 'HS0036'),
        ('h4', 'HS0036'), ('h5', 'HS0036'), ('h6', 'HS0036'),
        ('meter', 'HA0008'),
    }

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []
            self.empty_tags_not_closed = []

        def handle_starttag(self, tag, attrs):
            # Tag names must be in lowercase. Python's standard "html.parser"
            # module already converts tag names from uppercase to lowercase.
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))
            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            else:
                self.empty_tags_not_closed.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))
            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a
                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno - 1]
            return rawline[pos + 1:pos + 1 + len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return {Report('G00001', path, 0, '')}

    reports = set()

    # validate DOCTYPE, using the standard HTML parser since
    # requests-html ignores the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.add(Report('HS0002', path, lineno, obj))
            return reports

        rules = {
            'not_paired_tags': 'HS0005',
            'empty_tags_not_closed': 'HS0006',
            'duplicated_attrs': 'HS0009',
            'tag_not_lowercase': 'HS0010',
        }
        for a, e in rules.items():
            # no need to check the attribute exists,
            # since the doctype has been checked before
            for t in getattr(std_parser, a):
                reports.add(Report(e, path, t[1], t[0]))
    except AttributeError:
        reports.add(Report('HS0001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    all_ids = set()
    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline

        if tag in DEPRECATED_TAGS:
            reports.add(Report('HS0004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.add(Report('HS0003', path, lineno, tag))

        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
                if eval(f'not len(element.find(r[0][0])) {r[0][1]} r[0][2]'):
                    reports.add(Report(r[1], path, lineno, r[0][0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # validate accessibility attributes
        rules = REQUIRED_ATTRS_ACCESS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # parse attributes
        for a, v in element.attrs.items():
            a_lower = a
            # validate attribute name must be in lowercase
            if not a.islower():
                reports.add(Report('HS0011', path, lineno, a))
                a_lower = a.lower()
            if a_lower in DEPRECATED_ATTRS:
                reports.add(Report('HS0008', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS | VALID_ATTRS:
                reports.add(Report('HS0007', path, lineno, a))

            # validate attribute's value is NOT empty
            if not v and a_lower not in BOOL_ATTRS:
                reports.add(Report('HS0034', path, lineno, a))

            if a_lower == 'id':
                if v in all_ids:
                    reports.add(Report('HS0037', path, lineno, f'id="{v}"'))
                all_ids.add(v)

    for t in NOEMPTY_TAGS:
        for e in parser.find(t[0]):
            if not e.text:
                reports.add(Report(t[1], path, e.element.sourceline, e.element.tag))

    # the <h1> element must be present only once
    h1_list = parser.find('h1')
    if len(h1_list) > 1:
        e = h1_list[-1].element
        reports.add(Report('HA0004', path, e.sourceline, e.tag))

    # the <main> element without a "hidden" attribute must be present only once
    main_list = parser.find('main')
    main_count = len(main_list)
    main_hidden_count = len(parser.find('main[hidden]'))
    if main_count - main_hidden_count != 1:
        for e in main_list:
            reports.add(Report('HS0038', path, e.element.sourceline, 'main'))

    # the <meta> element with a "charset" attribute must be present only once
    meta_charset_list = parser.find('meta[charset]')
    meta_charset_count = len(meta_charset_list)
    if not meta_charset_count:
        reports.add(Report('HS0018', path, 0, 'meta charset'))
    elif meta_charset_count > 1:
        for e in meta_charset_list:
            obj = f'meta charset {meta_charset_count}'
            reports.add(Report('HS0009', path, e.element.sourceline, obj))

    # an <input> element with "type=image" must have "src" and "alt" attributes
    for e in parser.find('input[type="image"]'):
        if 'src' not in e.attrs:
            reports.add(Report('HS0039', path, e.element.sourceline, 'src'))
        if 'alt' not in e.attrs:
            reports.add(Report('HA0005', path, e.element.sourceline, 'alt'))

    # a <link> element must NOT have a `type` attribute with the value `text/css`
    for e in parser.find('link[rel="stylesheet"]'):
        assert 'href' in e.attrs
        if e.attrs['href'].endswith('css'):
            if 'type' in e.attrs and e.attrs['type'] == 'text/css':
                l = e.element.sourceline
                reports.add(Report('HS0041', path, l, 'type'))

    # a <script> element must NOT have a `type` attribute with the value `text/javascript`
    for e in parser.find('script[type="text/javascript"]'):
        l = e.element.sourceline
        reports.add(Report('HS0043', path, l, 'type'))

    return reports