def __parse_profile(self, page):
    try:
        html = HTML(html=page.text, url='bunk', default_encoding='utf-8')
    except KeyError:
        raise ValueError(
            f'Oops! Either "{self.username}" does not exist or is private.'
        )

    # TODO: check which exception is raised when there is no location
    self.location = html.find('.ProfileHeaderCard-locationText')[0].text

    # TODO: check which exception is raised when there is no birthday
    self.birthday = html.find('.ProfileHeaderCard-birthdateText')[0].text
    if self.birthday:
        self.birthday = self.birthday.replace('Born ', '')
    else:
        self.birthday = None

    self.profile_photo = html.find('.ProfileAvatar-image')[0].attrs['src']

    page_title = html.find('title')[0].text
    self.name = page_title[:page_title.find('(')].strip()

    self.biography = html.find('.ProfileHeaderCard-bio')[0].text
    self.website = html.find('.ProfileHeaderCard-urlText')[0].text

    # scrape profile stats
    _stats_source = html.find("span[class=ProfileNav-value]")

    # get total tweets count if available
    try:
        self.tweets_count = int(_stats_source[0].attrs['data-count'])
    except (IndexError, KeyError, ValueError):
        self.tweets_count = None

    # get total following count if available
    try:
        self.following_count = int(_stats_source[1].attrs['data-count'])
    except (IndexError, KeyError, ValueError):
        self.following_count = None

    # get total followers count if available
    try:
        self.followers_count = int(_stats_source[2].attrs['data-count'])
    except (IndexError, KeyError, ValueError):
        self.followers_count = None

    # get total likes count if available
    try:
        self.likes_count = int(_stats_source[3].attrs['data-count'])
    except (IndexError, KeyError, ValueError):
        self.likes_count = None
def vquest(config, collapse=True):
    """Submit a request to V-QUEST.

    config should be a dictionary of key/value pairs to use in the request.
    See data/options.yml for a full list, organized into sections.  Currently
    resultType must be "excel" and xv_outputtype must be 3 (for "Download AIRR
    formatted results").  Sequences are batched into sets of 50 (the most
    allowed by V-QUEST) and submitted one batch at a time.

    If collapse is True, results are combined as though they were submitted
    and processed as a single request, and a dictionary of file names to text
    contents is returned.  If collapse is False, a list of dictionaries is
    returned, one for each batch, storing raw byte contents.
    """
    if not all([
            config.get("species"),
            config.get("receptorOrLocusType"),
            config.get("fileSequences") or config.get("sequences")]):
        raise ValueError(
            "species, receptorOrLocusType, and fileSequences "
            "and/or sequences are required options")
    supported = [("resultType", "excel"), ("xv_outputtype", 3)]
    if all([config.get(pair[0]) == pair[1] for pair in supported]):
        outputs = []
        records = _parse_records(config)
        if not records:
            raise ValueError("No sequences supplied")
        LOGGER.info("Starting request batch for %d sequences total", len(records))
        for chunk in chunker(records, CHUNK_SIZE):
            if outputs:
                time.sleep(DELAY)
            LOGGER.info("Sending request with %d sequences...", len(chunk))
            out_handle = StringIO()
            SeqIO.write(chunk, out_handle, "fasta")
            config_chunk = config.copy()
            config_chunk["sequences"] = out_handle.getvalue()
            config_chunk["inputType"] = "inline"
            response = requests.post(URL, data=config_chunk)
            ctype = response.headers.get("Content-Type")
            LOGGER.debug("Received data of type %s", ctype)
            if ctype and "text/html" in ctype:
                html = HTML(html=response.content)
                errors = [div.text for div in html.find("div.form_error")]
                if errors:
                    raise VquestError("; ".join(errors), errors)
            outputs.append(unzip(response.content))
        if not collapse:
            return outputs
        return _collapse_outputs(outputs)
    needed = " ".join([pair[0] + "=" + str(pair[1]) for pair in supported])
    observed = " ".join(
        [pair[0] + "=" + str(config.get(pair[0])) for pair in supported])
    raise NotImplementedError(
        "Only " + needed + " currently supported, not " + observed)
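# A minimal usage sketch for vquest(), not part of the original module: the
# option values below are illustrative placeholders, and the helpers the
# function relies on (URL, CHUNK_SIZE, DELAY, LOGGER, chunker, _parse_records,
# unzip, _collapse_outputs) are assumed to be defined alongside it.
if __name__ == "__main__":
    example_config = {
        "species": "human",                    # required
        "receptorOrLocusType": "TR",           # required
        "sequences": ">seq1\nGATCGATCGATC\n",  # inline FASTA; fileSequences also works
        "resultType": "excel",                 # only supported value
        "xv_outputtype": 3,                    # "Download AIRR formatted results"
    }
    results = vquest(example_config)           # dict of file name -> text contents
    for filename in results:
        print(filename)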
def parse_review_html(html_text):
    r_html = HTML(html=html_text)
    review_html = r_html.find(".lenderReviews")
    # find() returns a list of elements; an empty list means nothing matched
    if not review_html:
        return parse_html_error
    review_body = review_html[0]
    reviews = review_body.find(".mainReviews")
    if not reviews:
        return parse_html_error
    return reviews
def parse(self, response):
    # decode the URL
    url = urllib.parse.unquote(response.url).strip()
    if str(response.url).find("error.html") != -1:
        # the current page is empty, so just return
        return
    # Pages are traversed by view while the returned objects are items, so
    # check the set of already-stored URLs first to avoid duplicate writes;
    # if this URL has already been crawled, return immediately.
    if response.url in self.urlGettedSet:
        return
    # wrap the response so requests-html can parse it
    html = HTML(html=response.text)
    list1 = html.find('.lemmaWgt-subLemmaListTitle')
    # polysemantList = html.find('.polysemantList-wrapper,cmn-clearfix', first=True)
    if list1:
        # the page is only a list of polysemous entries
        lemmaWgtElement = html.find(".custom_dot,para-list,list-paddingleft-1", first=True)
        # collect the links to the synonym entries
        urlList = baikeLinkExtractor1(lemmaWgtElement)
        for link in urlList:
            if link not in self.urlGettedSet:
                req = scrapy.http.request.Request(link, callback=self.parse)
                yield req
    else:
        # otherwise extract every Baike link on the page and follow it
        print(response)
        urlList = baikeLinksExtractor(html)
        for link in urlList:
            # links taken from the page are items, so no further check is needed
            if link not in self.urlGettedSet:
                req = scrapy.http.request.Request(link, callback=self.parse)
                yield req
    # 1. write the current page's URL and HTML to a file
    # replace special characters in the URL with underscores
    filename = re.sub("[/?&=#.\"'\\:*<>\|]", "_", url.split("/", 4)[-1])
    fitem = FileItem()
    # URLs visited in this run do not strictly need to be added to the visited
    # set (they are never re-parsed within a run), but they are written to
    # disk so they can be reloaded next time.
    fitem['Name'] = filename + ".txt"
    fitem['Content'] = str(html.html)
    # print(str(html.text))
    yield fitem
    urlItem = UrlItem()
    urlItem['url'] = response.url
    yield urlItem
def get_news_url_list(pages: int) -> list:
    '''Gets all news URLs.'''
    url_news = "/nba/news"
    r = session.get(url=url_base + url_news, headers=headers)
    links = []
    while pages > 0:
        html = HTML(html=r.text)
        news_list_body = html.find('#news_list_body', first=True)
        links.extend(list(news_list_body.links))
        page_link_next = html.find('div > gonext > a[data-id=right]', first=True).attrs['href']
        r = session.get(url=url_base + page_link_next, headers=headers)
        pages -= 1
    return links
def parse(data_object):
    # parse the html data
    html_data = HTML(html=data_object)
    # find the element that contains the result count
    css_class = ".result-count"
    result_count = html_data.find(css_class)
    total_forclosures = re.findall(r'\d+', result_count[0].text)[0]
    print(total_forclosures)
    return total_forclosures
def get_script_sources(url: str, body: str) -> List[str]:
    html = HTML(html=body)
    sources: List[str] = []

    for script in html.find("script"):
        source = script.attrs.get("src")
        if source is not None:
            sources.append(normalize_source(url, source))

    return list(set(sources))
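# Hypothetical call to get_script_sources(); the URL and markup are made up,
# and normalize_source() is assumed to resolve relative src values against
# the page URL, as the function above implies.
example_page_url = "https://example.com/"
example_body = '<html><head><script src="/static/app.js"></script></head><body></body></html>'
print(get_script_sources(example_page_url, example_body))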
def get_push_data_from_article_data_date(url):  # 2
    resp = fetch(url)
    html = HTML(html=resp.text)
    post_entries = html.find('div.push')
    data = []
    for entry in post_entries:
        check = parse_push_data(entry)
        if check is not None:
            data.append(check)
    return data
def parse_product_list(self, html):
    html = HTML(html=html)
    tbody_list = html.find(
        '#main > div.mycomment-bd > div.mycomment-table > table > tbody')
    lists = []
    for _tbody in tbody_list:
        _tbody = HTML(html=_tbody.html)
        product_url = furl(
            _tbody.find(
                'tr.tr-bd > td:nth-child(1) > div.goods-item > div.p-msg > div > a'
            )[0].attrs['href'])
        order_id = _tbody.find('tr.tr-th > td > span.number > a')[0].text
        lists.append({
            'product_id': int(str(product_url.path).strip('/').strip('.html')),
            'order_id': order_id
        })
    return lists
def check_login(self):
    '''Check whether the session is logged in.'''
    result = self.session.get(self.HOME_URL, verify=False)
    html = HTML(html=result.text)
    if html.find('title', first=True).text == '我的京东':
        return True
    else:
        return False
def fetch_image_links(url):
    # the over18 cookie is required for age-restricted boards
    response = requests.get(url, cookies={'over18': '1'})
    html = HTML(html=response.text)
    # print(response.text) shows that every image link lives inside an 'a'
    # element that is not nested under any other element
    content_entries = html.find('a')
    img_urls = []
    for content in content_entries:
        if re.match(r'^https?://(i.)?(m.)?imgur.com', content.attrs['href']):
            img_urls.append(content.attrs['href'])
    return img_urls
def scrape_data(url):
    # open the url in the (hidden) browser using selenium
    browser.get(url)
    # pause between queries to prevent errors; Amazon might block the account
    # if requests come too fast
    time.sleep(2)
    # get all the HTML data from the website, i.e. inside the <body> tag
    html_data = browser.find_element_by_css_selector('body').get_attribute("innerHTML")
    # convert the data to usable data; HTML() is imported from requests_html
    html_str = HTML(html=html_data)
    avbl = html_str.find("#availability")[0].text
    title_el = html_str.find('#productTitle')[0].text
    # this will contain all the data for the particular item
    data_el = []
    if not avbl.startswith("Currently"):
        price_el = html_str.find('#priceblock_ourprice')[0].text
        try:
            saving_el = html_str.find("#regularprice_savings")[0].text.split('(')[1].split(')')[0]
        except (IndexError, AttributeError):
            saving_el = "0%"
        data_el.append({
            "Title": title_el,
            "Price": price_el,
            "Available": avbl,
            "Saving": saving_el
        })
    else:
        avbl = avbl.split('.')[0]
        data_el.append({
            "Title": title_el,
            "Price": "NA",
            "Available": avbl,
            "Saving": "NA"
        })
    return data_el
async def get_tickets(page, session_params):
    s, webidx, terminal = session_params
    url = URL.format(terminal, webidx, page)
    async with s.get(url) as resp:
        h = await resp.text()
    html = HTML(html=h)
    table = html.find('table')[1].html
    df = pd.read_html(table, header=2)[0].dropna(
        axis=1, how='all').assign(terminal=terminal)
    return df
def get_trends(proxies=None):
    session = HTMLSession()
    html = session.get("https://twitter.com/i/trends",
                       headers=get_headers(), proxies=proxies)
    html = html.json()["module_html"]
    html = HTML(html=html, url="bunk", default_encoding="utf-8")
    for trend_item in html.find("li"):
        trend_text = trend_item.attrs["data-trend-name"]
        yield trend_text
class HTMLVotesParser:
    def __init__(self, html):
        self.html = HTML(html=html)
        self.date = None
        self.date_votes = None
        self.topic = None
        self.kind = None

    def next_td(self):
        for tr in self.html.find("tr"):
            for td in tr.find("td"):
                classes = td.attrs.get("class", ())
                yield td, classes

    def parse(self) -> VoteList:
        student = ""
        dates = []
        for td, classes in self.next_td():
            text = td.text
            if self._is_student(classes):
                student = td.find("span")[2].text
            if self._is_new_day(classes):
                self._init_new_day(dates, text)
            elif self._is_processing_day():
                self._process_day(classes, text)
        if self.date:
            dates.append((self.date, self.date_votes))
        return VoteList(student=student, votes=dates)

    def _process_day(self, classes, text):
        if "intestazioni" in classes:
            if not self.topic:
                self.topic = text
            else:
                self.kind = text
        elif "voto_" in classes:
            vote = Vote(self.topic, self.kind, text)
            self.topic = None
            self.date_votes.append(vote)

    def _is_processing_day(self):
        return self.date is not None

    def _init_new_day(self, dates, new_date):
        if self.date:
            dates.append((self.date, self.date_votes))
        self.date = new_date
        self.date_votes = []

    def _is_new_day(self, classes):
        return "registro" in classes

    def _is_student(self, classes):
        return "page-usr-user" in classes
def parse_post_entries(doc):
    html = HTML(html=doc)
    post_entries = html.find('#main-content', first=True).text
    post_content = post_entries.split('※ 發信站: 批踢踢實業坊(ptt.cc)')[0]
    post_content = post_content.split('\n')
    if len(post_content) == 5:
        content = post_content[4]
    else:
        content = post_content[5]
    return content
def get_token(content):
    """Method to get the token.

    Args:
        content (str): text content of the html request.

    Returns:
        token (str): token extracted from html content.
    """
    html = HTML(html=content)
    token = html.find('input', first=True).attrs.get('value')
    return token
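# Hypothetical usage of get_token(); the login URL is a placeholder, and the
# assumption that the page's first <input> carries the token matches the
# selector used above.
import requests

login_page = requests.get("https://example.com/login")
csrf_token = get_token(login_page.text)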
def gen_tweets(cv_url, r):
    try:
        html = HTML(html=r.json()['items_html'],
                    url='bunk', default_encoding='utf-8')
    except KeyError:
        raise ValueError(
            f'Oops! Either "{cv_url}" does not exist or is private.')

    comma = ","
    dot = "."
    tweets = []
    for tweet in html.find('.stream-item'):
        twwtext = tweet.find('.tweet-text')
        if len(twwtext) > 0:
            text = twwtext[0].full_text
        else:
            continue
        tweetId = tweet.find('.js-permalink')[0].attrs['data-conversation-id']
        href = tweet.find('.js-permalink')[0].attrs['href']
        tweetFrom = href.split('/')[1]
        # add 8 hours so the timestamp is shown in China Standard Time
        time = datetime.fromtimestamp(
            int(tweet.find('._timestamp')[0].attrs['data-time-ms']) / 1000.0 + 8 * 60 * 60)

        interactions = [x.text for x in tweet.find('.ProfileTweet-actionCount')]
        replies = int(interactions[0].split(" ")[0].replace(comma, "").replace(dot, ""))
        retweets = int(interactions[1].split(" ")[0].replace(comma, "").replace(dot, ""))
        likes = int(interactions[2].split(" ")[0].replace(comma, "").replace(dot, ""))

        hashtags = [hashtag_node.full_text
                    for hashtag_node in tweet.find('.twitter-hashtag')]
        urls = [url_node.attrs['data-expanded-url']
                for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')]
        photos = [photo_node.attrs['data-image-url']
                  for photo_node in tweet.find('.AdaptiveMedia-photoContainer')]

        videos = []
        video_nodes = tweet.find(".PlayableMedia-player")
        for node in video_nodes:
            styles = node.attrs['style'].split()
            for style in styles:
                if style.startswith('background'):
                    tmp = style.split('/')[-1]
                    video_id = tmp[:tmp.index('.jpg')]
                    videos.append({'id': video_id})

        tweets.append({
            'tweetId': tweetId,
            'time': time,
            'text': text,
            'cv_url': cv_url,
            'replies': replies,
            'retweets': retweets,
            'likes': likes,
            'isRetweet': tweetFrom == cv_url,
            'entries': {
                'hashtags': hashtags,
                'urls': urls,
                'photos': photos,
                'videos': videos
            }
        })
    return tweets
def scrape_product_page(url, title_lookup="#productTitle",
                        price_lookup="#priceblock_ourprice"):
    driver.get(url)
    time.sleep(3)
    body = driver.find_element_by_css_selector('body')
    html_str = body.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    return product_title, product_price
def gen_tweets(pages):
    r = session.get(url, headers=headers)

    while pages > 0:
        try:
            html = HTML(html=r.json()['items_html'],
                        url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{user}" does not exist or is private.')

        tweets = [tweet.full_text for tweet in html.find('.tweet-text')]
        last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

        for tweet in tweets:
            if tweet:
                yield re.sub('http', ' http', tweet, 1)

        r = session.get(url, params={'max_position': last_tweet}, headers=headers)
        pages -= 1
def parse(html: HTML) -> List[Metric]:
    """Scrape metrics tiles from page."""
    stats_cards = html.find(".stats-cards__container", first=True)
    tiles = stats_cards.find(".stats-cards__item")
    metrics = [
        Metric(
            label=tile.find(".stats-cards__label", first=True).text,
            value=tile.find(".stats-cards__number", first=True).text,
        )
        for tile in tiles
    ]
    return metrics
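# A short driver for parse(), assuming Metric exposes the label/value fields
# passed to its constructor above; the stats page URL is a placeholder.
from requests_html import HTMLSession

session = HTMLSession()
response = session.get("https://example.com/stats")
for metric in parse(response.html):
    print(metric.label, metric.value)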
def get_board_list(self, resp):
    boards = []
    html = HTML(html=resp.text)
    board_entries = html.find('div.b-ent')
    for idx, entry in enumerate(board_entries):
        meta = self.parser_boardlist_meta(entry)
        pretty_print_board(idx, meta['name'], meta['class'], meta['title'])
        boards.append(meta['name'])
    return boards
def main():
    resp = fetch(url=url)
    if resp.status_code == 200:
        html = HTML(html=resp.text)
        post_entries = html.find('div.r-ent')
        for entry in post_entries:
            meta = parser_article_meta(entry)
            print(meta)
    else:
        print(resp.status_code)
def parse(text):
    html = HTML(html=text)
    profile_url = html.find("a.user_avatar", first=True)
    if not profile_url:
        logging.error("Can not parse backend response - no a.user_avatar")
        raise UnknownBackendResponse()
    try:
        return profile_url.attrs["href"]
    except KeyError:
        logging.exception("Can not parse backend response")
        raise UnknownBackendResponse()
async def _get_city(ip):
    url = 'http://www.ip138.com/ips138.asp'
    async with aiohttp.ClientSession() as session:
        # async with async_timeout.timeout(10):
        async with session.get(url, params={"ip": ip}) as response:
            txt = await response.text()
            html = HTML(html=txt)
            html.encoding = "utf-8"
            lis = html.find('li')
            city = lis[0].text.split(":")[-1].split(" ")[0]
            return city
def html_parser(html: HTML) -> Dict:
    """Parses HTML element into individual sections

    Given an html element the html_parser will search for each profile
    section using CSS selectors. All parsed html elements are gathered into
    a dictionary and returned.

    Args:
        html: HTML element from a successful nitter profile scraped response.

    Returns:
        A dictionary of found elements from the parsed sections.
    """
    elements = {}
    elements["username"] = html.find(".profile-card-username", first=True)
    elements["name"] = html.find(".profile-card-fullname", first=True)
    elements["biography"] = html.find(".profile-bio", first=True)
    elements["location"] = html.find(".profile-location", first=True)
    elements["is_verified"] = html.find(
        ".profile-card-fullname .icon-container .verified-icon", first=True
    )
    elements["profile_photo"] = html.find(".profile-card-avatar", first=True)
    elements["banner_photo"] = html.find(".profile-banner a", first=True)
    elements["website"] = html.find(".profile-website", first=True)

    profile_statlist = html.find(".profile-statlist", first=True)
    elements["tweets_count"] = profile_statlist.find(".posts .profile-stat-num", first=True)
    elements["following_count"] = profile_statlist.find(".following .profile-stat-num", first=True)
    elements["followers_count"] = profile_statlist.find(".followers .profile-stat-num", first=True)
    elements["likes_count"] = profile_statlist.find(".likes .profile-stat-num", first=True)

    elements = {k: v for k, v in elements.items() if v is not None}
    return elements
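# A minimal sketch of feeding html_parser() a fetched nitter profile page;
# the instance URL and username below are placeholders, not part of the
# original module.
from requests_html import HTMLSession

def example_profile_sections(username: str) -> Dict:
    session = HTMLSession()
    response = session.get(f"https://nitter.net/{username}")
    return html_parser(response.html)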
def parse(text, user_profile_url):
    html = HTML(html=text)

    # find persona_name
    div = html.find("div.profile_header_centered_persona", first=True)
    if not div:
        fallback_div = html.find("div.welcome_header_ctn")
        if fallback_div:
            logger.info("Fresh account without set up steam profile.")
            raise UnfinishedAccountSetup()
        logger.error(
            "Can not parse backend response - no div.profile_header_centered_persona")
        raise UnknownBackendResponse()
    span = div.find("span.actual_persona_name", first=True)
    if not span:
        logger.error("Can not parse backend response - no span.actual_persona_name")
        raise UnknownBackendResponse()
    persona_name = span.text

    # find steam id
    variable = 'g_steamID = "'
    start = text.find(variable)
    if start == -1:
        logger.error("Can not parse backend response - no g_steamID variable")
        raise UnknownBackendResponse()
    start += len(variable)
    end = text.find('";', start)
    steam_id = text[start:end]

    # find miniprofile id
    profile_link = f'{user_profile_url}" data-miniprofile="'
    start = text.find(profile_link)
    if start == -1:
        logger.error("Can not parse backend response - no steam profile href")
        raise UnknownBackendResponse()

    return steam_id, persona_name
def process_page(text):
    html = HTML(html=text)
    item_css = '#content ol.grid_view > li'
    items = html.find(item_css)
    rank_css = 'em'
    title_css = '.info span.title'
    score_css = '.info .rating_num'
    for item in items:
        rank = int(item.find(rank_css, first=True).text)
        title = item.find(title_css, first=True).text
        score = float(item.find(score_css, first=True).text)
        movies_250.append(Movie(rank, score, title))
def get_page_count():
    """Get the total number of pages (as an integer) from the main
    "Auction History" page."""
    time.sleep(1)
    main_r = requests.get(root_url, headers=request_headers)
    if main_r.status_code == 200:
        main_r_html = HTML(html=main_r.text)
        page_numbers = main_r_html.find(".PageLink")
        main_r.close()
        max_page = int(list(page_numbers[-1].links)[0].split("page=")[-1])
        return max_page
    else:
        return 0
def parse(self, html: HTML) -> List[ProxyIP]:
    ip_list: List[ProxyIP] = []

    for ip_row in html.find('.proxylist tbody tr'):
        ip_port = ip_row.find('td:nth-child(1)', first=True).text
        ip_address, port = ip_port.split(":")
        p = ProxyIP(ip=ip_address, port=port)
        ip_list.append(p)

    return ip_list
def test_bare_render():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """
    html = HTML(html=doc)
    html.render()

    assert html.find('#replace', first=True).text == 'yolo'
def submit_form(session, response, data=None):
    from requests_html import HTML
    html = HTML(url=response.url, html=response.text)
    forms = html.find('form')
    if len(forms) == 0:
        raise Exception('Page does not have any forms')
    form = forms[0]
    url = form.attrs['action']
    fields = form.find('input')
    data = data or {}
    for field in fields:
        name = field.attrs['name']
        if name not in data:
            value = field.attrs['value']
            data[name] = value
    response = session.post(urljoin(response.url, url), data=data)
    return response
def _weblint_html(path: pathlib.Path, doctype: str) -> set:
    '''HTML Lint for WebLint.'''
    DEPRECATED_TAGS = {
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'frameset', 'noframes', 'acronym', 'big', 'u', 'isindex', 'basefont',
        'dir', 'applet', 'style',
    }
    REQUIRED_TAGS = {
        'html': (
            (('head', '==', 1), 'HS0013'),
            (('body', '==', 1), 'HS0014'),
        ),
        'head': (
            (('title', '==', 1), 'HS0015'),
            (('meta', '>=', 1), 'HS0018'),
            (('script', '==', 0), 'HP0001'),
        ),
        'ul': ((('li', '>=', 1), 'HS0019'),),
        'ol': ((('li', '>=', 1), 'HS0020'),),
        'select': ((('option', '>=', 1), 'HS0021'),),
        'dl': (
            (('dt', '>=', 1), 'HS0022'),
            (('dd', '>=', 1), 'HS0023'),
        ),
        'video': ((('source', '>=', 1), 'HS0024'),),
        'audio': ((('source', '>=', 1), 'HS0026'),),
        'details': ((('summary', '==', 1), 'HS0029'),),
        'aside': ((('main', '==', 0), 'HA0006'),),
        'figure': ((('figcaption', '==', 1), 'HS0044'),),
    }
    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img',
        'keygen', 'link', 'meta', 'output', 'param', 'track', 'wbr', 'source',
    }
    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio', 'bdi', 'bdo',
        'blockquote', 'body', 'button', 'canvas', 'caption', 'cite', 'code',
        'col', 'colgroup', 'data', 'datalist', 'dd', 'del', 'details', 'dfn',
        'dialog', 'div', 'dl', 'dt', 'em', 'fieldset', 'figure', 'figcaption',
        'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head',
        'header', 'hgroup', 'html', 'ins', 'kbd', 'label', 'legend', 'li',
        'main', 'map', 'menu', 'menuitem', 'meter', 'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup', 'p', 'picture', 'pre',
        'progress', 'q', 'rb', 'rp', 'rt', 'rtc', 'ruby', 'samp', 'script',
        'section', 'select', 'span', 'strong', 'sub', 'summary', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead', 'time',
        'title', 'tfoot', 'tr', 'ul', 'var', 'video',
    }
    DEPRECATED_ATTRS = {
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'valign', 'accept', 'vspace',
    }
    GLOBAL_ATTRS = {'lang', 'id', 'class', 'title', 'hidden'}
    VALID_ATTRS = {
        'charset', 'name', 'src', 'content', 'controls', 'type', 'href',
        'alt', 'rel', 'value', 'min', 'max',
    }
    BOOL_ATTRS = {'controls', 'hidden'}
    REQUIRED_ATTRS = {
        'html': (('lang',), 'HS0012'),
        'video': (('controls',), 'HS0027'),
        'source': (('src', 'type'), 'HS0025'),
        'audio': (('controls',), 'HS0028'),
        'a': (('href',), 'HS0031'),
        'img': (('src',), 'HS0033'),
        'input': (('type',), 'HS0035'),
        'link': (('rel', 'href'), 'HS0040'),
        'script': (('src',), 'HS0042'),
        'progress': (('value', 'max'), 'HS0045'),
        'meter': (('value', 'min', 'max'), 'HS0046'),
    }
    REQUIRED_ATTRS_ACCESS = {
        'img': (('alt',), 'HA0001'),
        'a': (('title',), 'HA0007'),
    }
    NOEMPTY_TAGS = {
        ('title', 'HS0016'), ('p', 'HS0017'), ('summary', 'HS0030'),
        ('a', 'HS0032'), ('video', 'HA0002'), ('audio', 'HA0003'),
        ('h1', 'HS0036'), ('h2', 'HS0036'), ('h3', 'HS0036'),
        ('h4', 'HS0036'), ('h5', 'HS0036'), ('h6', 'HS0036'),
        ('meter', 'HA0008'),
    }

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []
            self.empty_tags_not_closed = []

        def handle_starttag(self, tag, attrs):
            # tag name must be in lowercase
            # The standard "html.parser" module already converts tag names
            # from uppercase to lowercase.
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            else:
                self.empty_tags_not_closed.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a
                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno - 1]
            return rawline[pos + 1:pos + 1 + len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return {Report('G00001', path, 0, '')}

    reports = set()

    # validate the DOCTYPE with the standard HTML parser, since
    # requests-html ignores the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.add(Report('HS0002', path, lineno, obj))
            return reports
        rules = {
            'not_paired_tags': 'HS0005',
            'empty_tags_not_closed': 'HS0006',
            'duplicated_attrs': 'HS0009',
            'tag_not_lowercase': 'HS0010',
        }
        for a, e in rules.items():
            # no need to check that the attribute exists,
            # since the doctype has been checked already
            for t in getattr(std_parser, a):
                reports.add(Report(e, path, t[1], t[0]))
    except AttributeError:
        reports.add(Report('HS0001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    all_ids = set()
    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline
        if tag in DEPRECATED_TAGS:
            reports.add(Report('HS0004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.add(Report('HS0003', path, lineno, tag))
        else:
            pass

        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
                if eval(f'not len(element.find(r[0][0])) {r[0][1]} r[0][2]'):
                    reports.add(Report(r[1], path, lineno, r[0][0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # validate accessibility attributes
        rules = REQUIRED_ATTRS_ACCESS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # parse attributes
        for a, v in element.attrs.items():
            a_lower = a
            # attribute names must be in lowercase
            if not a.islower():
                reports.add(Report('HS0011', path, lineno, a))
                a_lower = a.lower()
            if a_lower in DEPRECATED_ATTRS:
                reports.add(Report('HS0008', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS | VALID_ATTRS:
                reports.add(Report('HS0007', path, lineno, a))

            # validate that the attribute's value is NOT empty
            if not v and a_lower not in BOOL_ATTRS:
                reports.add(Report('HS0034', path, lineno, a))

            if a_lower == 'id':
                if v in all_ids:
                    reports.add(Report('HS0037', path, lineno, f'id="{v}"'))
                all_ids.add(v)

    for t in NOEMPTY_TAGS:
        for e in parser.find(t[0]):
            if not e.text:
                reports.add(Report(t[1], path, e.element.sourceline, e.element.tag))

    # <h1> element must be present only once
    h1_list = parser.find('h1')
    if len(h1_list) > 1:
        e = h1_list[-1].element
        reports.add(Report('HA0004', path, e.sourceline, e.tag))

    # <main> element without "hidden" attribute must be present only once
    main_list = parser.find('main')
    main_count = len(main_list)
    main_hidden_count = len(parser.find('main[hidden]'))
    if main_count - main_hidden_count != 1:
        for e in main_list:
            reports.add(Report('HS0038', path, e.element.sourceline, 'main'))

    # <meta> element with "charset" attribute must be present only once
    meta_charset_list = parser.find('meta[charset]')
    meta_charset_count = len(meta_charset_list)
    if not meta_charset_count:
        reports.add(Report('HS0018', path, 0, 'meta charset'))
    elif meta_charset_count > 1:
        for e in meta_charset_list:
            obj = f'meta charset {meta_charset_count}'
            reports.add(Report('HS0009', path, e.element.sourceline, obj))

    # <input> element with "type=image" must have "src" and "alt" attributes
    for e in parser.find('input[type="image"]'):
        if 'src' not in e.attrs:
            reports.add(Report('HS0039', path, e.element.sourceline, 'src'))
        if 'alt' not in e.attrs:
            reports.add(Report('HA0005', path, e.element.sourceline, 'alt'))

    # <link> element must NOT have a `type` attribute with value `text/css`
    for e in parser.find('link[rel="stylesheet"]'):
        assert 'href' in e.attrs
        if e.attrs['href'].endswith('css'):
            if 'type' in e.attrs and e.attrs['type'] == 'text/css':
                l = e.element.sourceline
                reports.add(Report('HS0041', path, l, 'type'))

    # <script> element must NOT have a `type` attribute with value `text/javascript`
    for e in parser.find('script[type="text/javascript"]'):
        l = e.element.sourceline
        reports.add(Report('HS0043', path, l, 'type'))

    return reports