Example #1
    def __parse_profile(self, page):
        try:
            html = HTML(html=page.text, url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{self.username}" does not exist or is private.'
            )
        except ParserError:
            raise ValueError(
                f'Oops! The profile page for "{self.username}" could not be parsed.'
            )

        # TODO: check which exception is raised when there is no location
        self.location = html.find('.ProfileHeaderCard-locationText')[0].text

        # TODO: check which exception is raised when there is no birthday
        self.birthday = html.find('.ProfileHeaderCard-birthdateText')[0].text
        if self.birthday:
            self.birthday = self.birthday.replace('Born ', '')
        else:
            self.birthday = None

        self.profile_photo = html.find('.ProfileAvatar-image')[0].attrs['src']

        page_title = html.find('title')[0].text
        self.name = page_title[:page_title.find('(')].strip()

        self.biography = html.find('.ProfileHeaderCard-bio')[0].text

        self.website = html.find('.ProfileHeaderCard-urlText')[0].text

        # scrape profile stats
        _stats_source = html.find("span[class=ProfileNav-value]")

        # get total tweets count if available
        try:
            q = _stats_source[0].attrs['data-count']
            self.tweets_count = int(q)
        except (IndexError, KeyError, ValueError):
            self.tweets_count = None

        # get total following count if available
        try:
            q = _stats_source[1].attrs['data-count']
            self.following_count = int(q)
        except (IndexError, KeyError, ValueError):
            self.following_count = None

        # get total follower count if available
        try:
            q = _stats_source[2].attrs['data-count']
            self.followers_count = int(q)
        except (IndexError, KeyError, ValueError):
            self.followers_count = None

        # get total like count if available
        try:
            q = _stats_source[3].attrs['data-count']
            self.likes_count = int(q)
        except (IndexError, KeyError, ValueError):
            self.likes_count = None
Example #2
File: request.py  Project: ressy/vquest
def vquest(config, collapse=True):
    """Submit a request to V-QUEST.

    config should be a dictionary of key/value pairs to use in the request.  See
    data/options.yml for a full list, organized into sections.  Currently
    resultType must be "excel" and xv_outputtype must be 3 (for "Download AIRR
    formatted results").

    sequences are batched into sets of 50 (the most allowed by V-QUEST) and
    submitted one batch at a time.  If collapse is True, results are combined
    as though they were submitted and processed as a single request, and a
    dictionary of file names to text contents is returned.  If collapse is
    False, a list of dictionaries is returned, one for each batch, storing raw
    byte contents.
    """
    if not all([
            config.get("species"),
            config.get("receptorOrLocusType"),
            config.get("fileSequences") or config.get("sequences")
    ]):
        raise ValueError("species, receptorOrLocusType, and fileSequences "
                         "and/or sequences are required options")
    supported = [("resultType", "excel"), ("xv_outputtype", 3)]
    if all([config.get(pair[0]) == pair[1] for pair in supported]):
        outputs = []
        records = _parse_records(config)
        if not records:
            raise ValueError("No sequences supplied")
        LOGGER.info("Starting request batch for %d sequences total",
                    len(records))
        for chunk in chunker(records, CHUNK_SIZE):
            if outputs:
                time.sleep(DELAY)
            LOGGER.info("Sending request with %d sequences...", len(chunk))
            out_handle = StringIO()
            SeqIO.write(chunk, out_handle, "fasta")
            config_chunk = config.copy()
            config_chunk["sequences"] = out_handle.getvalue()
            config_chunk["inputType"] = "inline"
            response = requests.post(URL, data=config_chunk)
            ctype = response.headers.get("Content-Type")
            LOGGER.debug("Received data of type %s", ctype)
            if ctype and "text/html" in ctype:
                html = HTML(html=response.content)
                errors = [div.text for div in html.find("div.form_error")]
                if errors:
                    raise VquestError("; ".join(errors), errors)
            outputs.append(unzip(response.content))
        if not collapse:
            return outputs
        return _collapse_outputs(outputs)
    needed = " ".join([pair[0] + "=" + str(pair[1]) for pair in supported])
    observed = " ".join(
        [pair[0] + "=" + str(config.get(pair[0])) for pair in supported])
    raise NotImplementedError(
        ("Only " + needed + " currently supported, not " + observed))
Example #3
def parse_review_html(html_text):
    r_html = HTML(html=html_text)
    review_html = r_html.find(".lenderReviews")
    if not review_html:  # find() returns a list; empty means nothing matched
        return parse_html_error
    review_body = review_html[0]
    reviews = review_body.find(".mainReviews")
    if not reviews:
        return parse_html_error
    return reviews
Example #4
    def parse(self, response):

        # URL-decode the response URL
        url = urllib.parse.unquote(response.url).strip()

        if str(response.url).find("error.html") != -1:  # if the current page is an error page, just return
            return
        # We traverse by view but yield items, so first check whether this URL is already in the stored set to avoid duplicate writes; if it has already been crawled, return
        if response.url in self.urlGettedSet:
            return

        html = HTML(html=response.text)  # wrap the response so that requests-html can parse it
        list1 = html.find('.lemmaWgt-subLemmaListTitle')
        # polysemantList = html.find('.polysemantList-wrapper,cmn-clearfix', first=True)
        # if the page only has a polysemant (disambiguation) list
        if list1:
            lemmaWgtElement = html.find(".custom_dot,para-list,list-paddingleft-1", first=True)
            urlList = baikeLinkExtractor1(lemmaWgtElement)  # extract the synonym links
            for link in urlList:
                if link not in self.urlGettedSet:
                    req = scrapy.http.request.Request(link, callback=self.parse)
                    yield req
        else:
            # otherwise extract all Baike links from the page and follow them
            print(response)
            urlList = baikeLinksExtractor(html)
            for link in urlList:
                # links taken from the page are items, so no extra check is strictly needed
                if link not in self.urlGettedSet:
                    req = scrapy.http.request.Request(link, callback=self.parse)
                    yield req
                    # 1. The current page's URL and HTML need to be written to a file
            filename = re.sub("[/?&=#.\"'\\:*<>|]", "_", url.split("/", 4)[-1])  # replace special characters in the URL with underscores
            fitem = FileItem()
            # Should URLs visited in this run also be added to the visited set? Not really: they will not be re-parsed within a single run, but they are written to disk so they can be read back next time
            fitem['Name'] = filename + ".txt"
            fitem['Content'] = str(html.html)
            # print(str(html.text))
            yield fitem

            urlItem = UrlItem()
            urlItem['url'] = response.url
            yield urlItem
Example #5
    def get_news_url_list(pages: int) -> list:
        '''Gets all news URLs'''

        url_news = "/nba/news"
        r = session.get(url=url_base + url_news, headers=headers)

        links = []
        while pages > 0:
            html = HTML(html=r.text)
            news_list_body = html.find('#news_list_body', first=True)
            links.extend(list(news_list_body.links))

            page_link_next = html.find('div > gonext > a[data-id=right]',
                                       first=True).attrs['href']

            r = session.get(url=url_base + page_link_next, headers=headers)
            pages -= 1

        return links
Example #6
def parse(data_object):
    # get the html data
    html_data = HTML(html=data_object)

    # get the element that contains the result count
    css_class = ".result-count"
    result_count = html_data.find(css_class)
    total_forclosures = (re.findall(r'\d+', result_count[0].text))[0]
    print(total_forclosures)
    return total_forclosures
Example #7
def get_script_sources(url: str, body: str) -> List[str]:
    html = HTML(html=body)

    sources: List[str] = []
    for script in html.find("script"):
        source = script.attrs.get("src")
        if source is not None:
            sources.append(normalize_source(url, source))

    return list(set(sources))
Example #8
def get_push_data_from_article_data_date(url):  #2
    resp = fetch(url)
    html = HTML(html=resp.text)
    post_entries = html.find('div.push')
    data = []
    for entry in post_entries:
        check = parse_push_data(entry)
        if check is not None:
            data.append(check)
    return data
Example #9
 def parse_product_list(self, html):
     html = HTML(html=html)
     tbody_list = html.find(
         '#main > div.mycomment-bd > div.mycomment-table > table > tbody')
     lists = []
     for _tbody in tbody_list:
         _tbody = HTML(html=_tbody.html)
         product_url = furl(
             _tbody.find(
                 'tr.tr-bd > td:nth-child(1) > div.goods-item > div.p-msg > div > a'
             )[0].attrs['href'])
         order_id = _tbody.find('tr.tr-th > td > span.number > a')[0].text
         lists.append({
             'product_id':
             int(str(product_url.path).strip('/').strip('.html')),
             'order_id':
             order_id
         })
     return lists
Example #10
    def check_login(self):
        ''' Check login status: the JD home page title is '我的京东' ("My JD") only when logged in '''
        result = self.session.get(self.HOME_URL, verify=False)

        html = HTML(html=result.text)

        if html.find('title', first=True).text == '我的京东':
            return True
        else:
            return False
Example #11
def fetch_image_links(url):
    response = requests.get(url, cookies={'over18': '1'})  # the over18 cookie passes PTT's age-confirmation page
    html = HTML(html=response.text)
    content_entries = html.find('a')        # print(response.text) shows that the image links all live under <a> elements, not nested inside any other element
    img_urls = []
    for content in content_entries:
        if re.match(r'^https?://(i\.)?(m\.)?imgur\.com', content.attrs['href']):
            img_urls.append(content.attrs['href'])
    return img_urls
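A usage sketch; the board URL below is a placeholder, and the over18 cookie only matters for age-restricted boards:

img_urls = fetch_image_links("https://www.ptt.cc/bbs/Beauty/index.html")  # hypothetical PTT page
for img_url in img_urls:
    print(img_url)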
Example #12
def scrape_data(url):
    # Open the url in the browser(hidden) using selenium
    browser.get(url)

    # Pause between queries to avoid errors; Amazon may block the account if requests come too fast
    time.sleep(2)

    # Get all the HTML from the page, i.e. everything inside the <body> tag
    html_data = browser.find_element_by_css_selector('body').get_attribute(
        "innerHTML")
    # Convert the data to usable data. HTML() is imported from requests_html
    html_str = HTML(html=html_data)

    avbl = html_str.find("#availability")[0].text
    title_el = html_str.find('#productTitle')[0].text

    # This will hold all the scraped data for this particular item
    data_el = []

    if not avbl.startswith("Currently"):
        price_el = html_str.find('#priceblock_ourprice')[0].text
        try:
            saving_el = html_str.find("#regularprice_savings")[0].text.split(
                '(')[1].split(')')[0]
        except IndexError:
            saving_el = "0%"

        data_el.append({
            "Title": title_el,
            "Price": price_el,
            "Available": avbl,
            "Saving": saving_el
        })
    else:
        avbl = avbl.split('.')[0]
        data_el.append({
            "Title": title_el,
            "Price": "NA",
            "Available": avbl,
            "Saving": "NA"
        })

    return data_el
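A usage sketch, assuming the selenium browser used above is already initialised; the product URL is a placeholder:

items = scrape_data("https://www.amazon.com/dp/B000000000")  # hypothetical product page
for item in items:
    print(item["Title"], item["Price"], item["Available"], item["Saving"])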
Example #13
async def get_tickets(page, session_params):
    s, webidx, terminal = session_params
    url = URL.format(terminal, webidx, page)
    async with s.get(url) as resp:
        h = await resp.text()
        html = HTML(html=h)
        table = html.find('table')[1].html
        df = pd.read_html(table, header=2)[0].dropna(
            axis=1, how='all').assign(terminal=terminal)
        return df
Example #14
def get_trends(proxies=None):
    session = HTMLSession()
    html = session.get("https://twitter.com/i/trends",
                       headers=get_headers(),
                       proxies=proxies)
    html = html.json()["module_html"]
    html = HTML(html=html, url="bunk", default_encoding="utf-8")
    for trend_item in html.find("li"):
        trend_text = trend_item.attrs["data-trend-name"]
        yield trend_text
Example #15
class HTMLVotesParser:
    def __init__(self, html):
        self.html = HTML(html=html)
        self.date = None
        self.date_votes = None
        self.topic = None
        self.kind = None

    def next_td(self):
        for tr in self.html.find("tr"):
            for td in tr.find("td"):
                classes = td.attrs.get("class", ())
                yield td, classes

    def parse(self) -> VoteList:
        student = ""
        dates = []
        for td, classes in self.next_td():
            text = td.text
            if self._is_student(classes):
                student = td.find("span")[2].text
            if self._is_new_day(classes):
                self._init_new_day(dates, text)

            elif self._is_processing_day():
                self._process_day(classes, text)
        if self.date:
            dates.append((self.date, self.date_votes))
        return VoteList(student=student, votes=dates)

    def _process_day(self, classes, text):
        if "intestazioni" in classes:
            if not self.topic:
                self.topic = text
            else:
                self.kind = text
        elif "voto_" in classes:
            vote = Vote(self.topic, self.kind, text)
            self.topic = None
            self.date_votes.append(vote)

    def _is_processing_day(self):
        return self.date is not None

    def _init_new_day(self, dates, new_date):
        if self.date:
            dates.append((self.date, self.date_votes))
        self.date = new_date
        self.date_votes = []

    def _is_new_day(self, classes):
        return "registro" in classes

    def _is_student(self, classes):
        return "page-usr-user" in classes
Example #16
def parse_post_entries(doc):
    html = HTML(html=doc)
    post_entries = html.find('#main-content', first=True).text
    post_content = post_entries.split('※ 發信站: 批踢踢實業坊(ptt.cc)')[0]
    post_content = post_content.split('\n')
    if len(post_content) == 5:
        content = post_content[4]
    else:
        content = post_content[5]

    return content
Example #17
def get_token(content):
    """Method to get the token.

    Args:
        content (str): text content of the html request.
    Returns:
        token (str): token extracted from html content.
    """
    html = HTML(html=content)
    token = html.find('input', first=True).attrs.get('value')
    return token
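A minimal usage sketch; the URL is hypothetical, and any page whose first <input> element carries the token in its value attribute would work:

import requests

resp = requests.get("https://example.com/login")  # hypothetical login page
token = get_token(resp.text)
print(token)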
Example #18
    def gen_tweets(cv_url, r):
        try:
            html = HTML(html=r.json()['items_html'],
                        url='bunk', default_encoding='utf-8')
        except KeyError:
            raise ValueError(
                f'Oops! Either "{user}" does not exist or is private.')

        comma = ","
        dot = "."
        tweets = []
        for tweet in html.find('.stream-item'):
            twwtext = tweet.find('.tweet-text')
            if len(twwtext)>0:
                text = twwtext[0].full_text
            else:
                continue
            tweetId = tweet.find(
                '.js-permalink')[0].attrs['data-conversation-id']

            href = tweet.find(
                '.js-permalink')[0].attrs['href']
            tweetFrom = href.split('/')[1]

            time = datetime.fromtimestamp(
                int(tweet.find('._timestamp')[0].attrs['data-time-ms'])/1000.0 + 8*60*60)  # add 8 hours so the timestamp shows China Standard Time (UTC+8)
            interactions = [x.text for x in tweet.find(
                '.ProfileTweet-actionCount')]
            replies = int(interactions[0].split(" ")[0].replace(comma, "").replace(dot,""))
            retweets = int(interactions[1].split(" ")[
                            0].replace(comma, "").replace(dot,""))
            likes = int(interactions[2].split(" ")[0].replace(comma, "").replace(dot,""))
            hashtags = [hashtag_node.full_text for hashtag_node in tweet.find('.twitter-hashtag')]
            urls = [url_node.attrs['data-expanded-url'] for url_node in tweet.find('a.twitter-timeline-link:not(.u-hidden)')]
            photos = [photo_node.attrs['data-image-url'] for photo_node in tweet.find('.AdaptiveMedia-photoContainer')]
            
            videos = []
            video_nodes = tweet.find(".PlayableMedia-player")
            for node in video_nodes:
                styles = node.attrs['style'].split()
                for style in styles:
                    if style.startswith('background'):
                        tmp = style.split('/')[-1]
                        video_id = tmp[:tmp.index('.jpg')]
                        videos.append({'id': video_id})
            tweets.append({'tweetId': tweetId, 'time': time, 'text': text,'cv_url':cv_url,
                            'replies': replies, 'retweets': retweets, 'likes': likes, 'isRetweet':tweetFrom == cv_url,
                            'entries': {
                                'hashtags': hashtags, 'urls': urls,
                                'photos': photos, 'videos': videos
                            }
                            })

        return tweets
Example #19
def scrape_product_page(url,
                        title_lookup="#productTitle",
                        price_lookup="#priceblock_ourprice"):
    driver.get(url)
    time.sleep(3)
    body = driver.find_element_by_css_selector('body')
    html_str = body.get_attribute('innerHTML')
    html_obj = HTML(html=html_str)
    product_title = html_obj.find(title_lookup, first=True).text
    product_price = html_obj.find(price_lookup, first=True).text
    return product_title, product_price
Example #20
    def gen_tweets(pages):
        r = session.get(url, headers=headers)

        while pages > 0:
            try:
                html = HTML(html=r.json()['items_html'], url='bunk', default_encoding='utf-8')
            except KeyError:
                raise ValueError(
                    f'Oops! Either "{user}" does not exist or is private.')

            tweets = [tweet.full_text for tweet in html.find('.tweet-text')]
            last_tweet = html.find('.stream-item')[-1].attrs['data-item-id']

            for tweet in tweets:
                if tweet:
                    yield re.sub('http', ' http', tweet, 1)

            r = session.get(
                url, params={'max_position': last_tweet}, headers=headers)
            pages -= 1
Example #21
def parse(html: HTML) -> List[Metric]:
    """Scrape metrics tiles from page."""
    stats_cards = html.find(".stats-cards__container", first=True)
    tiles = stats_cards.find(".stats-cards__item")
    metrics = [
        Metric(
            label=tile.find(".stats-cards__label", first=True).text,
            value=tile.find(".stats-cards__number", first=True).text,
        ) for tile in tiles
    ]
    return metrics
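A usage sketch, assuming Metric exposes .label and .value as constructed above and that the page is fetched with requests-html:

from requests_html import HTMLSession

session = HTMLSession()
resp = session.get("https://example.com/stats")  # hypothetical page with .stats-cards__* tiles
for metric in parse(resp.html):
    print(metric.label, metric.value)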
Example #22
    def get_board_list(self, resp):
        boards = []
        html = HTML(html=resp.text)
        board_entries = html.find('div.b-ent')

        for idx, entry in enumerate(board_entries):
            meta = self.parser_boardlist_meta(entry)
            pretty_print_board(idx, meta['name'], meta['class'], meta['title'])
            boards.append(meta['name'])

        return boards
Example #23
def main():
    resp = fetch(url=url)
    if resp.status_code == 200:
        html = HTML(html=resp.text)
        post_entries = html.find('div.r-ent')

        for entry in post_entries:
            meta = parser_article_meta(entry)
            print(meta)
    else:
        print(resp.status_code)
Example #24
 def parse(text):
     html = HTML(html=text)
     profile_url = html.find("a.user_avatar", first=True)
     if not profile_url:
         logging.error("Can not parse backend response - no a.user_avatar")
         raise UnknownBackendResponse()
     try:
         return profile_url.attrs["href"]
     except KeyError:
         logging.exception("Can not parse backend response")
         raise UnknownBackendResponse()
Example #25
async def _get_city(ip):
    url = 'http://www.ip138.com/ips138.asp'
    async with aiohttp.ClientSession() as session:
        # async with async_timeout.timeout(10):
        async with session.get(url, params={"ip": ip}) as response:
            txt = await response.text()
            html = HTML(html=txt)
            html.encoding = "utf-8"
            lis = html.find('li')
            city = lis[0].text.split(":")[-1].split(" ")[0]
            return city
Example #26
def html_parser(html: HTML) -> Dict:
    """Parses HTML element into individual sections

    Given an html element the html_parser will search for each profile section using
    CSS selectors. All parsed html elements are gathered into a dictionary and returned.

    Args:
        html: HTML element from a successful nitter profile scraped response.

    Returns:
        A dictionary of found elements from the parsed sections.

    """
    elements = {}

    elements["username"] = html.find(".profile-card-username", first=True)

    elements["name"] = html.find(".profile-card-fullname", first=True)

    elements["biography"] = html.find(".profile-bio", first=True)

    elements["location"] = html.find(".profile-location", first=True)

    elements["is_verified"] = html.find(
        ".profile-card-fullname .icon-container .verified-icon", first=True
    )

    elements["profile_photo"] = html.find(".profile-card-avatar", first=True)

    elements["banner_photo"] = html.find(".profile-banner a", first=True)

    elements["website"] = html.find(".profile-website", first=True)

    profile_statlist = html.find(".profile-statlist", first=True)

    elements["tweets_count"] = profile_statlist.find(".posts .profile-stat-num", first=True)

    elements["following_count"] = profile_statlist.find(".following .profile-stat-num", first=True)

    elements["followers_count"] = profile_statlist.find(".followers .profile-stat-num", first=True)

    elements["likes_count"] = profile_statlist.find(".likes .profile-stat-num", first=True)

    elements = {k: v for k, v in elements.items() if v is not None}

    return elements
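A minimal usage sketch, assuming the nitter profile page has already been fetched into resp (e.g. with requests):

html = HTML(html=resp.text)
elements = html_parser(html)
# only sections that were actually found are present in the returned dictionary
if "username" in elements:
    print(elements["username"].text)
if "followers_count" in elements:
    print(elements["followers_count"].text)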
Example #27
        def parse(text, user_profile_url):
            html = HTML(html=text)
            # find persona_name
            div = html.find("div.profile_header_centered_persona", first=True)
            if not div:
                fallback_div = html.find("div.welcome_header_ctn")
                if fallback_div:
                    logger.info("Fresh account without set up steam profile.")
                    raise UnfinishedAccountSetup()
                logger.error(
                    "Can not parse backend response - no div.profile_header_centered_persona"
                )
                raise UnknownBackendResponse()
            span = div.find("span.actual_persona_name", first=True)
            if not span:
                logger.error(
                    "Can not parse backend response - no span.actual_persona_name"
                )
                raise UnknownBackendResponse()
            persona_name = span.text

            # find steam id
            variable = 'g_steamID = "'
            start = text.find(variable)
            if start == -1:
                logger.error(
                    "Can not parse backend response - no g_steamID variable")
                raise UnknownBackendResponse()
            start += len(variable)
            end = text.find('";', start)
            steam_id = text[start:end]

            # sanity-check that the profile link (which carries the miniprofile id) is present
            profile_link = f'{user_profile_url}" data-miniprofile="'
            start = text.find(profile_link)
            if start == -1:
                logger.error(
                    "Can not parse backend response - no steam profile href")
                raise UnknownBackendResponse()

            return steam_id, persona_name
Example #28
def process_page(text):
    html = HTML(html=text)
    item_css = '#content  ol.grid_view > li'
    items = html.find(item_css)
    rank_css = 'em'
    title_css = '.info  span.title'
    score_css = '.info  .rating_num'
    for item in items:
        rank = int(item.find(rank_css, first=True).text)
        title = item.find(title_css, first=True).text
        score = float(item.find(score_css, first=True).text)
        movies_250.append(Movie(rank, score, title))
Example #29
def get_page_count():
    """ Get the total number of pages (as an integer) from the main "Auction History" page. """
    time.sleep(1)
    main_r = requests.get(root_url, headers=request_headers)
    if main_r.status_code == 200:
        main_r_html = HTML(html=main_r.text)
        page_numbers = main_r_html.find(".PageLink")
        main_r.close()
        max_page = int(list(page_numbers[-1].links)[0].split("page=")[-1])
        return max_page
    else:
        return 0
Example #30
    def parse(self, html: HTML) -> List[ProxyIP]:
        ip_list: List[ProxyIP] = []

        for ip_row in html.find('.proxylist tbody tr'):
            ip_port = ip_row.find('td:nth-child(1)', first=True).text
            ip_address, port = ip_port.split(":")

            p = ProxyIP(ip=ip_address, port=port)

            ip_list.append(p)

        return ip_list
Example #31
def test_bare_render():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
Example #32
def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    html = HTML(html=doc)
    html.render()

    assert html.find('#replace', first=True).text == 'yolo'
Example #33
File: commands.py  Project: kylef/goji
def submit_form(session, response, data=None):
    from requests_html import HTML
    html = HTML(url=response.url, html=response.text)

    forms = html.find('form')
    if len(forms) == 0:
        raise Exception('Page does not have any forms')

    form = forms[0]
    url = form.attrs['action']
    fields = form.find('input')

    data = data or {}

    for field in fields:
        name = field.attrs['name']

        if name not in data:
            value = field.attrs['value']
            data[name] = value

    response = session.post(urljoin(response.url, url), data=data)
    return response
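A hedged usage sketch; the URL and field names are hypothetical, and data pre-fills the named inputs while the remaining fields keep their default values:

import requests

session = requests.Session()
response = session.get('https://example.org/login')  # hypothetical page containing a <form>
response = submit_form(session, response, data={'username': 'alice', 'password': 'secret'})
print(response.status_code)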
Example #34
def _weblint_html(path: pathlib.Path, doctype: str) -> set:
    '''HTML Lint for WebLint.
    '''

    DEPRECATED_TAGS = {
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'frameset', 'noframes', 'acronym', 'big', 'u', 'isindex', 'basefont',
        'dir', 'applet', 'style',
    }

    REQUIRED_TAGS = {
        'html': (
            (('head', '==', 1), 'HS0013'),
            (('body', '==', 1), 'HS0014'),
        ),
        'head': (
            (('title', '==', 1), 'HS0015'),
            (('meta', '>=', 1), 'HS0018'),
            (('script', '==', 0), 'HP0001'),
        ),
        'ul': (
            (('li', '>=', 1), 'HS0019'),
        ),
        'ol': (
            (('li', '>=', 1), 'HS0020'),
        ),
        'select': (
            (('option', '>=', 1), 'HS0021'),
        ),
        'dl': (
            (('dt', '>=', 1), 'HS0022'),
            (('dd', '>=', 1), 'HS0023'),
        ),
        'video': (
            (('source', '>=', 1), 'HS0024'),
        ),
        'audio': (
            (('source', '>=', 1), 'HS0026'),
        ),
        'details': (
            (('summary', '==', 1), 'HS0029'),
        ),
        'aside': (
            (('main', '==', 0), 'HA0006'),
        ),
        'figure': (
            (('figcaption', '==', 1), 'HS0044'),
        ),
    }

    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img', 'keygen',
        'link', 'meta', 'output', 'param', 'track', 'wbr', 'source',
    }

    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio',
        'bdi', 'bdo', 'blockquote', 'body', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
        'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div',
            'dl', 'dt',
        'em',
        'fieldset', 'figure', 'figcaption', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'html',
        'ins',
        'kbd',
        'label', 'legend', 'li',
        'main', 'map', 'menu', 'menuitem', 'meter',
        'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup',
        'p', 'picture', 'pre', 'progress',
        'q',
        'rb', 'rp', 'rt', 'rtc', 'ruby',
        'samp', 'script', 'section', 'select', 'span', 'strong',
            'sub',  'summary', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead', 'time',
            'title', 'tfoot', 'tr',
        'ul',
        'var', 'video'
    }

    DEPRECATED_ATTRS = {
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'valign', 'accept', 'vspace',
    }

    GLOBAL_ATTRS = {
        'lang', 'id', 'class', 'title', 'hidden',
    }

    VALID_ATTRS = {
        'charset', 'name', 'src', 'content', 'controls', 'type', 'href',
        'alt', 'rel', 'value', 'min', 'max',
    }

    BOOL_ATTRS = {
        'controls', 'hidden',
    }

    REQUIRED_ATTRS = {
        'html': (('lang',), 'HS0012'),
        'video': (('controls',), 'HS0027'),
        'source': (('src', 'type'), 'HS0025'),
        'audio': (('controls',), 'HS0028'),
        'a': (('href',), 'HS0031'),
        'img': (('src',), 'HS0033'),
        'input': (('type',), 'HS0035'),
        'link': (('rel', 'href'), 'HS0040'),
        'script': (('src',), 'HS0042'),
        'progress': (('value', 'max'), 'HS0045'),
        'meter': (('value', 'min', 'max'), 'HS0046'),
    }

    REQUIRED_ATTRS_ACCESS = {
        'img': (('alt',), 'HA0001'),
        'a': (('title',), 'HA0007'),
    }

    NOEMPTY_TAGS = {
        ('title', 'HS0016'),
        ('p', 'HS0017'),
        ('summary', 'HS0030'),
        ('a', 'HS0032'),
        ('video', 'HA0002'),
        ('audio', 'HA0003'),
        ('h1', 'HS0036'),
        ('h2', 'HS0036'),
        ('h3', 'HS0036'),
        ('h4', 'HS0036'),
        ('h5', 'HS0036'),
        ('h6', 'HS0036'),
        ('meter', 'HA0008'),
    }

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []
            self.empty_tags_not_closed = []

        def handle_starttag(self, tag, attrs):

            # tag name must be in lowercase
            # The Python standard module "html.parser" already converts tag
            # names to lowercase, so check the raw source text instead.
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            else:
                self.empty_tags_not_closed.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a

                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno-1]
            return rawline[pos+1:pos+1+len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return {Report('G00001', path, 0, '')}
    reports = set()

    # validate DOCTYPE using the standard HTML parser, since
    # requests-html ignores the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.add(Report('HS0002', path, lineno, obj))
            return reports

        rules = {
            'not_paired_tags': 'HS0005',
            'empty_tags_not_closed': 'HS0006',
            'duplicated_attrs': 'HS0009',
            'tag_not_lowercase': 'HS0010',
        }
        for a, e in rules.items():
            # no need to check attr exists,
            # since doctype has been checked before
            for t in getattr(std_parser, a):
                reports.add(Report(e, path, t[1], t[0]))

    except AttributeError:
        reports.add(Report('HS0001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    all_ids = set()
    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline
        if tag in DEPRECATED_TAGS:
            reports.add(Report('HS0004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.add(Report('HS0003', path, lineno, tag))
        else:
            pass
        
        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
                if eval(f'not len(element.find(r[0][0])) {r[0][1]} r[0][2]'):
                    reports.add(Report(r[1], path, lineno, r[0][0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # validate accessibility attributes
        rules = REQUIRED_ATTRS_ACCESS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # parse attributes
        for a, v in element.attrs.items():
            a_lower = a

            # validate attribute name must be in lowercase
            if not a.islower():
                reports.add(Report('HS0011', path, lineno, a))
                a_lower = a.lower()

            if a_lower in DEPRECATED_ATTRS:
                reports.add(Report('HS0008', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS | VALID_ATTRS:
                reports.add(Report('HS0007', path, lineno, a))
            
            # validate attribute's value is NOT empty
            if not v and a_lower not in BOOL_ATTRS:
                reports.add(Report('HS0034', path, lineno, a))

            if a_lower == 'id':
                if v in all_ids:
                    reports.add(Report('HS0037', path, lineno, f'id="{v}"'))
                all_ids.add(v)

    for t in NOEMPTY_TAGS:
        for e in parser.find(t[0]):
            if not e.text:
                reports.add(Report(t[1], path, e.element.sourceline, e.element.tag))

    # `<h1>` element must be present only once
    h1_list = parser.find('h1')
    if len(h1_list) > 1:
        e = h1_list[-1].element
        reports.add(Report('HA0004', path, e.sourceline, e.tag))

    # <main> element without "hidden" attribute must be present only once
    main_list = parser.find('main')
    main_count = len(main_list)
    main_hidden_count = len(parser.find('main[hidden]'))
    if main_count - main_hidden_count != 1:
        for e in main_list:
            reports.add(Report('HS0038', path, e.element.sourceline, 'main'))

    # <meta> element with "charset" attribute must be present only once
    meta_charset_list = parser.find('meta[charset]')
    meta_charset_count = len(meta_charset_list)
    if not meta_charset_count:
        reports.add(Report('HS0018', path, 0, 'meta charset'))
    elif meta_charset_count > 1:
        for e in meta_charset_list:
            obj = f'meta charset {meta_charset_count}'
            reports.add(Report('HS0009', path, e.element.sourceline, obj))

    # <input> element with "type=image" must have "src" and "alt" attributes
    for e in parser.find('input[type="image"]'):
        if 'src' not in e.attrs:
            reports.add(Report('HS0039', path, e.element.sourceline, 'src'))
        if 'alt' not in e.attrs:
            reports.add(Report('HA0005', path, e.element.sourceline, 'alt'))

    # <link> element must **NOT** have `type` attribute with value of `text/css`
    for e in parser.find('link[rel="stylesheet"]'):
        assert 'href' in e.attrs
        if e.attrs['href'].endswith('css'):
            if 'type' in e.attrs and e.attrs['type'] == 'text/css':
                l = e.element.sourceline
                reports.add(Report('HS0041', path, l, 'type'))

    # <script> element must **NOT** have `type` attribute with value of `text/javascript`
    for e in parser.find('script[type="text/javascript"]'):
        l = e.element.sourceline
        reports.add(Report('HS0043', path, l, 'type'))

    return reports
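A minimal usage sketch, assuming the Report type from the surrounding project and that the doctype argument matches what html.parser's handle_decl reports for the expected declaration (e.g. "DOCTYPE html" for HTML5):

reports = _weblint_html(pathlib.Path("index.html"), "DOCTYPE html")
for report in reports:
    print(report)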