Example #1
def parse_and_extract(url: str, year: int = current_year):
    html_txt = url_to_file(url)

    r_html = HTML(html=html_txt)
    table_class = ".imdb-scroll-table"
    r_table = r_html.find(table_class)
    # print(r_table)

    table_data = []
    # table_dict_data = {}
    headers = []

    if len(r_table) == 1:
        # print(r_table[0].text)
        parsed_table = r_table[0]
        rows = parsed_table.find('tr')
        header_row = rows[0]
        header_col = header_row.find('th')
        headers = [head.text for head in header_col]

        for row in rows[1:]:
            # print(row.text)
            cols = row.find('td')
            row_data = []
            # row_dict_data = {}
            for i, col in enumerate(cols):
                # print(i, col.text, '\n\n')

                # header_name = header_names[i]
                # row_dict_data[header_name] = col.text

                row_data.append(col.text)
            table_data.append(row_data)
            # table_dict_data.append(row_dict_data)

    # print(headers)
    # print(table_data[0])

    path = os.path.join(BASE_DIR, 'data')
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, f'{year}.csv')

    df = pd.DataFrame(table_data, columns=headers)
    # df = pd.DataFrame(table_dict_data)
    df.to_csv(filepath, index=False)
Example #2
def parse_and_extract(name=None):
    html_text = url_to_text(save=True, name=name)
    r_html = HTML(html=html_text)
    table_class = ".table"
    r_table = r_html.find(table_class)

    #table_data = []
    table_data_dicts = []
    header_names = []
    if len(r_table) == 0:
        return False
    parsed_table = r_table[0]
    rows = parsed_table.find("tr")
    header_row = rows[0]
    header_cols = header_row.find("th")

    for x in header_cols:
        header_names.append(x.text)
    #header_names = [x.text for x in header_cols]
    #print(header_names)

    for row in rows[1:]:
        cols = row.find("td")
        #row_data =[]
        row_dict_data = {}
        for i, col in enumerate(cols):
            #print(i, col.text, "\n")
            # Dictionaries work when all header_names are unique, but with
            # duplicate header names the data gets overwritten; for the
            # list-based approach refer to scrape.py (a short demonstration
            # follows this example).
            header_name = header_names[i]
            row_dict_data[header_name] = col.text
            #row_data.append(col.text)

        print(row_dict_data)
        #table_data.append(row_data)
        table_data_dicts.append(row_dict_data)
    #df = pd.DataFrame(table_data, columns = header_names)
    df = pd.DataFrame(table_data_dicts)

    path = os.path.join(Base_dir, 'data')
    os.makedirs(path, exist_ok=True)
    filepath = os.path.join(path, f"{name}.csv")
    df.to_csv(filepath, index=False)
    return True
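A short demonstration of the trade-off noted in the comment above: with duplicate header names a plain dict keeps only the last column, while a list of row values preserves every column (plain Python, no extra assumptions):

headers = ["Team", "Score", "Score"]   # duplicate header name
cells = ["Lions", "3", "7"]

row_dict = {}
for name, value in zip(headers, cells):
    row_dict[name] = value             # the second "Score" overwrites the first

row_list = list(cells)                 # a list keeps both "Score" columns

print(row_dict)   # {'Team': 'Lions', 'Score': '7'}
print(row_list)   # ['Lions', '3', '7']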
Example #3
 def parse_html(self):
     html = HTML(html=self.origin_request.content.decode(
         'utf-8', 'ignore').encode('utf-8'),
                 url=self.origin_request.url)
     for item in html.find('.latnewslist'):
         url = item.absolute_links.pop()
         origin_id = int(url.split('/')[-1])
         _id = "{0}_{1}".format(self.origin, origin_id)
         title = item.find('.entry > a > h3', first=True).text.strip()
         if _id not in self.done_ids and self.is_title_needed(title):
             notice = {
                 'id': _id,
                 'url': url,
                 'origin': constants.ORIGINS[self.origin],
                 'origin_id': origin_id,
                 'title': title,
             }
             notice_detail = HTML(html=self.session.get(url).content.decode(
                 'utf-8', 'ignore').encode('utf-8'),
                                  url=url)
             notice['content'] = notice_detail.find('.dtl-content',
                                                    first=True).text
             # str.find() returns -1 when the "上一篇" ("previous post") marker is
             # missing, which the guard below expects; str.index() would raise instead.
             end_index = notice['content'].find('上一篇')
             if end_index > 0:
                 notice['content'] = notice['content'][:end_index]
             notice['posted_at'] = int(
                 datetime.datetime.strptime(
                     notice_detail.find('.new-dtl-info > span',
                                        first=True).text[:19],
                     '%Y-%m-%d %H:%M:%S').timestamp())
             notice['short_content'] = item.find('.news-brief',
                                                 first=True).text
             self.update_line(notice)
Example #4
def parse():

    session = HTMLSession()

    r = session.get(REQUEST_URL, headers={"accept": "application/json"})

    threads = r.json().get("threads")

    parsed_data = []
    
    for thread in threads[1:]:

        timestamp = thread.get("posts")[0].get("timestamp")
        timestamp = datetime.fromtimestamp(timestamp).astimezone(tz)

        if timestamp > datetime.now().astimezone(tz) - timedelta(hours=1):
            
            thread_number = thread.get("thread_num")
            
            thread_subject = thread.get("posts")[0].get("subject")

            text = thread.get("posts")[0].get("comment")
            text = HTML(html=text)
            links = text.find("a")
            text = text.html
            text = text.replace("<br>", "\n")  # replace HTML line breaks with newlines
            text = bs4.BeautifulSoup(text, features="lxml").get_text()

            # replace plain HTML links with markdown-style links
            for link in links:
                href = link.attrs["href"]
                text = text.replace(href, "[" + href + "](" + href + ")")

            thread_files = thread.get("posts")[0].get("files")

            thread_files = [thread_file.get("path") for thread_file in thread_files] if thread_files else None
                
            thread_link = "2ch.hk/news/res/" + str(thread_number) + ".html"

            text = __format_text(text, thread_subject, thread_link) 
            
            parsed_data.append(ThreadInfo(thread_number, timestamp, thread_subject, text, thread_files, thread_link))
            
        
    return parsed_data
Example #5
def test_bare_render():
    doc = """<a href='https://httpbin.org'>"""
    html = HTML(html=doc)
    script = """
        () => {
            return {
                width: document.documentElement.clientWidth,
                height: document.documentElement.clientHeight,
                deviceScaleFactor: window.devicePixelRatio,
            }
        }
    """
    val = html.render(script=script, reload=False)
    for value in ('width', 'height', 'deviceScaleFactor'):
        assert value in val

    assert html.find('html')
    assert 'https://httpbin.org' in html.links
Example #6
def extract_data(html_text):
    r_html = HTML(html=html_text)
    table_id = '#customers'
    r_table = r_html.find(table_id)
    parsed_table = r_table[0]
    rows = parsed_table.find('tr')
    header_row = rows[0]
    header_names = [col.text for col in header_row.find('th')]

    table_data = []
    for row in rows[1:]:
        cols = row.find('td')
        row_data = []
        for col in cols:
            row_data.append(col.text)
        table_data.append(row_data)

    return header_names, table_data
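A possible way to drive extract_data, assuming the page is fetched with requests (the URL is a placeholder, not from the original example):

import requests

# Fetch a page that contains a table with id="customers" and hand its text to extract_data.
resp = requests.get("https://example.com/page-with-customers-table")
headers, rows = extract_data(resp.text)
print(headers)
print(rows[0] if rows else "no data rows found")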
Example #7
    def get_overall_stats(self):

        html = HTML(html=self.page)
        counters = html.find('#maincounter-wrap')
        main_counter = []
        cases = {}
        for entry in counters:
            main_counter.append(
                int(entry.text.split('\n')[1].replace(',', '')))

        cases["Total cases"] = main_counter[0]
        cases["Total recovered"] = main_counter[2]
        cases["Total deaths"] = main_counter[1]
        cases["Active cases"] = cases["Total cases"] - cases[
            "Total recovered"] - cases["Total deaths"]
        cases["Mortality ratio"] = round(
            (cases["Total deaths"] * 100 / cases["Total cases"]), 2)
        return cases
Example #8
    def test_get_items_from_page(self, HTMLSession):
        html = HTML(html=load_html('basepage'))
        HTMLSession.return_value.get.return_value = DummyResponse(html)

        spell = self._create_spell({'itemListSelector': '#unselect'})
        result = spell._get_items_from_page('test_url')

        for item in result:
            assert item
Example #9
def test_bare_js_eval():
    doc = """
    <!DOCTYPE html>
    <html>
    <body>
    <div id="replace">This gets replaced</div>

    <script type="text/javascript">
      document.getElementById("replace").innerHTML = "yolo";
    </script>
    </body>
    </html>
    """

    html = HTML(html=doc)
    html.render()

    assert html.find('#replace', first=True).text == 'yolo'
Example #10
 def test_html_to_list_valid_html(self):
     """Should convert requests_html.HTML instance to `list`"""
     html = HTML(html=" \
         <tr><th>A</th><th>V</th></tr> \
         <tr><td>a1</td><td>v1</td></tr> \
         <tr><td>a2</td><td>v2</td></tr> \
     ")
     self.assertEqual(html_to_list(html),
                      [['A', 'V'], ['a1', 'v1'], ['a2', 'v2']])
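html_to_list itself is not shown in this example; a minimal sketch consistent with the test above, assuming requests_html's find() API (an illustration, not the project's actual implementation):

def html_to_list(html):
    # Each <tr> becomes one list; its header cells (<th>) or data cells (<td>) become the items.
    rows = []
    for row in html.find('tr'):
        cells = row.find('th') or row.find('td')
        rows.append([cell.text for cell in cells])
    return rows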
Example #11
    def _get_next_page(self, current_page: HTML):
        next_page_href = current_page.find('a', containing='>>', first=True)
        next_page_html = self._get_url_content('{url}{next_page}'.format(
            url=self.url, next_page=next_page_href.attrs['href']))

        if current_page.url == next_page_html.url:
            return None

        return next_page_html
Example #12
        def parse(text):
            html = HTML(html=text)
            # find login
            pulldown = html.find("#account_pulldown", first=True)
            if not pulldown:
                raise UnknownBackendResponse()
            login = pulldown.text

            # find steam id
            variable = 'g_steamID = "'
            start = text.find(variable)
            if start == -1:
                raise UnknownBackendResponse()
            start += len(variable)
            end = text.find('";', start)
            steam_id = text[start:end]

            return steam_id, login
Example #13
def to_dict(htmltext):
    # if not isinstance(htmltext, six.string_types):
    #     raise TypeError("Except str got {}".format(type(htmltext).__name__))
    script_str = 'define("detail"'
    script_str1 = "define('detail'"
    text = HTML(html=htmltext)
    try:
        key_word_obj = text.find("script", containing=script_str)[0].text
    except IndexError:
        # fall back to the single-quoted define('detail') marker
        key_word_obj = text.find("script", containing=script_str1)[0].text
    key_word_obj = key_word_obj.replace("\\", "")
    key_word = re.search(r'return(.*?})', key_word_obj, re.S).group(1)
    key_word = key_word.replace("\\", "")
    keyword_dict = demjson.decode(json.dumps(key_word, ensure_ascii=False))
    if isinstance(keyword_dict, str):
        keyword_dict = demjson.decode(keyword_dict)
    print(type(keyword_dict))
    return keyword_dict
Example #14
        def parse(text):
            html = HTML(html=text)
            rows = html.find(".achieveRow")
            achievements = []
            try:
                for row in rows:
                    unlock_time = row.find(".achieveUnlockTime", first=True)
                    if unlock_time is None:
                        continue
                    unlock_time = int(
                        self.parse_date(unlock_time.text).timestamp())
                    name = row.find("h3", first=True).text
                    achievements.append((unlock_time, name))
            except (AttributeError, ValueError, TypeError):
                logging.exception("Can not parse backend response")
                raise UnknownBackendResponse()

            return achievements
Example #15
def parse_article_content(doc):
    html = HTML(html=doc)
    entry = html.find('div.bbs-screen')
    content = entry[0].text
    # Strip the PTT board's boilerplate template lines from the post body.
    boilerplate_lines = [
        "【1.請注意兩日內僅能徵、賣、估各1篇,切勿2PO or 以上   】",
        "【2.非本板討論範圍請勿PO文(詳細規定請看置底板規)       】",
        "【3.確定無誤再發文,發現有誤請大T修標題大E修內文       】",
        "【4.無用的整行文字 (例此行以上) 可按「Ctrl+Y」刪除整行】",
        "【5.賣出後勿清空內文、標題、價格,違者水桶2個月        】",
        "【6.勿刪除他人推文,違者退文並水桶1個月                】",
        "【7.請 先 按 「Ctrl+V」!!  還原色碼後,方可正常編輯   】",
        "(沒有明確價格、賣出後清空價格,水桶2個月)",
        "(購買日期、保固有無、使用期間、新舊程度)",
        "(官方規格、網拍連結、實物品樣照片)",
        "(自取、面交、郵寄、宅急便)",
        "(限面交者請交待詳細區域地點!!)",
        "(站內信、手機、即時通訊軟體、如何稱呼)",
    ]
    for line in boilerplate_lines:
        content = content.replace(line, "")
    return content
Example #16
def get_page_count():
    """ Get the total number of pages (as integer) from the main "Auction History" page: """
    global max_page_hour

    time.sleep(1)
    main_r = requests.get(root_url, headers=request_headers)
    if main_r.status_code == 200:
        main_r_html = HTML(html=main_r.text)
        page_numbers = main_r_html.find(".PageLink")
        main_r.close()
        max_page = int(list(page_numbers[-1].links)[0].split("page=")[-1])

        max_page_hour = datetime.datetime.now().hour

        return max_page
    else:
        main_r.close()
        return 0
Example #17
    def test_create_task(self):
        """Check if task will be created."""
        self.client.post("/new-task",
                         data=dict(task_id="foobar",
                                   description="barfoo",
                                   src="http://foo.bar/mets.xml",
                                   workflow_id=self.get_workflow_id(),
                                   default_file_grp="file_grp"))
        task = models.Task.get()

        response = self.client.get("/tasks")
        html = HTML(html=response.data)
        assert len(html.find('table > tr > td')) == COLUMN_COUNT
        assert html.find('table > tr > td')[2].text == "file_grp"
        status_col_txt = html.find('table > tr > td')[6].text
        assert status_col_txt.startswith("CREATED")
        assert "worker_task.id" in status_col_txt
        self.client.get(f"/task/delete/{ task.uid }")
Example #20
        def parse(text, user_profile_url):
            html = HTML(html=text)
            # find persona_name
            div = html.find("div.profile_header_centered_persona", first=True)
            if not div:
                fallback_div = html.find("div.welcome_header_ctn")
                if fallback_div:
                    logger.info("Fresh account without set up steam profile.")
                    raise UnfinishedAccountSetup()
                logger.error(
                    "Can not parse backend response - no div.profile_header_centered_persona"
                )
                raise UnknownBackendResponse()
            span = div.find("span.actual_persona_name", first=True)
            if not span:
                logger.error(
                    "Can not parse backend response - no span.actual_persona_name"
                )
                raise UnknownBackendResponse()
            persona_name = span.text

            # find steam id
            variable = 'g_steamID = "'
            start = text.find(variable)
            if start == -1:
                logger.error(
                    "Can not parse backend response - no g_steamID variable")
                raise UnknownBackendResponse()
            start += len(variable)
            end = text.find('";', start)
            steam_id = text[start:end]

            # find miniprofile id
            profile_link = f'{user_profile_url}" data-miniprofile="'
            start = text.find(profile_link)
            if start == -1:
                logger.error(
                    "Can not parse backend response - no steam profile href")
                raise UnknownBackendResponse()
            start += len(profile_link)
            end = text.find('">', start)
            miniprofile_id = text[start:end]

            return steam_id, miniprofile_id, persona_name
Example #21
def get_root():
    res = requests.get(root_url)
    text = res.text
    html_page = HTML(html=text, url=root_url)
    links = html_page.links
    pattern = r'\?start=.+'  # raw string avoids the invalid "\?" escape warning
    for link in links:
        if re.search(pattern, link):
            pages_list.append(link)
    parse_page(text)
Example #22
def get_questions(tag: str, days: int):
    to_date = datetime.date.today()
    from_date = to_date - datetime.timedelta(days)
    url = "https://api.stackexchange.com/docs/questions"
    params = {
        'order': 'desc',
        'sort': 'activity',
        'filter': 'default',
        'tagged': tag,  # use the function's tag argument rather than hard-coding 'python'
        'site': 'stackoverflow',
        'fromdate': f'{from_date}',
        'todate': f'{to_date}'
    }
    response = requests.get(url, params=params)
    html_str = response.text
    html = HTML(html=html_str)
    questions_summaries = html.find('.question-summary')
    print(questions_summaries)
    print(html_str)
Example #23
def get_shows_html(doc):
    html = HTML(html=doc)

    # deduplicate movie names as they could be repeated across the text
    content = set()

    for elem in html.find('font[style*="italic"]'):
        title = elem.text.strip()
        if title and title != ".":
            if ',' in title:
                names = title.split(',')
                for name in names:
                    name = name.strip()
                    if name:
                        content.add(name)
            else:
                content.add(title.strip())

    return content
Example #24
async def main(urlPamar):
    res = await request(urlPamar)
    res = HTML(html=res)
    about = res.find('div.list-item-desc-top a')
    for i, title in enumerate(about):
        nextUrl = f'https:{title.attrs["href"]}'
        nextHtml = await request(nextUrl)
        print(f'{i + 1} [{title.text}](https:{title.attrs["href"]})')
        try:
            startIndex = nextHtml.index('"address":')
            endIndex = nextHtml.index(',"extraInfos"')
            print(nextHtml[startIndex:endIndex])
        except ValueError:
            if "验证中心" in nextHtml:  # landed on the site's verification (captcha) page
                print("需要验证")  # "verification required"
            else:
                print(nextHtml)
Example #25
    def get_news_url_list(pages: int) -> list:
        '''Gets all news URLs.'''

        url_news = "/nba/news"
        r = session.get(url=url_base + url_news, headers=headers)

        links = []
        while pages > 0:
            html = HTML(html=r.text)
            news_list_body = html.find('#news_list_body', first=True)
            links.extend(list(news_list_body.links))

            page_link_next = html.find('div > gonext > a[data-id=right]',
                                       first=True).attrs['href']

            r = session.get(url=url_base + page_link_next, headers=headers)
            pages -= 1

        return links
Example #26
def parse_movies_from_url(url):
    data = []
    headers = []

    html_txt = url_to_html(url)
    r_html = HTML(html=html_txt)
    r_table = r_html.find('#table')

    if len(r_table) > 0:
        parsed_table = r_table[0]
        rows = parsed_table.find('tr')
        headers = [header.text for header in rows[0].find('th')]
        for row in rows[1:]:
            columns = row.find('td')
            row_data = []
            for column in columns:
                row_data.append(column.text)
            data.append(row_data)
    return (headers, data)
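A possible follow-up, mirroring the pandas pattern of Examples #1 and #2 (the URL is a placeholder and the pandas import is assumed):

import pandas as pd

headers, data = parse_movies_from_url("https://example.com/movies")
df = pd.DataFrame(data, columns=headers)
df.to_csv("movies.csv", index=False)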
Example #27
    def description_urls(self):

        pages = self.__find_pages__(INDEX_PAGE_URL)
        urls = OrderedDict()
        for index_page in pages:
            self.__log__.info('Processing page: %s', index_page)
            r = self.__session__.get(index_page)
            doc = HTML(html=r.html.find('html', first=True).html)
            self.__log__.info('  -> start parsing HTML')
            for t in doc.find('.col_premierecondition'):
                spans = t.find('span')
                for s in spans:
                    if __is_class__(s, 'lien') and 'Fi' in s.text:
                        item = s.text.replace('-', '').strip()
                        url = s.find('a', first=True).attrs['href']
                        self.__log__.info('%s -> %s', item, url)
                        urls[item] = BASE_URL.format(url)

        return urls
Example #28
    def parse(self, response):

        # decode the URL
        url = urllib.parse.unquote(response.url).strip()

        if str(response.url).find("error.html") != -1:  # if the current page is an error page, just return
            return
        # We traverse by view but items are returned, so first check whether the URL is
        # already in the stored set to avoid duplicate writes; if already crawled, return.
        if response.url in self.urlGettedSet:
            return

        html = HTML(html=response.text)  # convert the response into a form requests-html can parse
        list1 = html.find('.lemmaWgt-subLemmaListTitle')
        # polysemantList = html.find('.polysemantList-wrapper,cmn-clearfix', first=True)
        # if the page only has a list of polysemous entries
        if list1:
            lemmaWgtElement = html.find(".custom_dot,para-list,list-paddingleft-1", first=True)
            urlList = baikeLinkExtractor1(lemmaWgtElement)  # get links to the synonymous entries
            for link in urlList:
                if link not in self.urlGettedSet:
                    req = scrapy.http.request.Request(link, callback=self.parse)
                    yield req
        else:
            # no synonym list: extract all encyclopedia (Baike) links from the page and crawl them
            print(response)
            urlList = baikeLinksExtractor(html)
            for link in urlList:
                # links taken from the page are items, so no further check is strictly needed
                if link not in self.urlGettedSet:
                    req = scrapy.http.request.Request(link, callback=self.parse)
                    yield req
                    # 1. write the current page's URL and HTML content to a file
            filename = re.sub("[/?&=#.\"'\\:*<>\|]", "_", url.split("/", 4)[-1])  # replace special characters in the URL with underscores
            fitem = FileItem()
            # Do the URLs visited in this run also need to go into the visited set? Not really:
            # a single run never re-parses a page, but they are written to disk so they can be loaded next time.
            fitem['Name'] = filename + ".txt"
            fitem['Content'] = str(html.html)
            # print(str(html.text))
            yield fitem

            urlItem = UrlItem()
            urlItem['url'] = response.url
            yield urlItem
Example #29
def _get_posts(path, pages=10, timeout=5, sleep=0, credentials=None):
    """Gets posts for a given account."""
    global _session, _timeout

    url = f'{_base_url}/{path}'

    _session = HTMLSession()
    _session.headers.update(_headers)

    if credentials:
        _login_user(*credentials)

    _timeout = timeout
    response = _session.get(url, timeout=_timeout)
    html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
    cursor_blob = html.html

    while True:
        for article in html.find('article'):
            yield _extract_post(article)

        pages -= 1
        if pages == 0:
            return

        cursor = _find_cursor(cursor_blob)
        next_url = f'{_base_url}{cursor}'

        if sleep:
            time.sleep(sleep)

        try:
            response = _session.get(next_url, timeout=timeout)
            response.raise_for_status()
            data = json.loads(response.text.replace('for (;;);', '', 1))
        except (RequestException, ValueError):
            return

        for action in data['payload']['actions']:
            if action['cmd'] == 'replace':
                html = HTML(html=action['html'], url=_base_url)
            elif action['cmd'] == 'script':
                cursor_blob = action['code']
Example #30
def _get_group_posts(path,
                     pages=10,
                     timeout=5,
                     sleep=0,
                     credentials=None,
                     extra_info=False):
    """Gets posts for a given account."""
    global _session, _timeout

    url = f'{_base_url}/{path}'

    _session = HTMLSession()
    _session.headers.update(_headers)

    if credentials:
        _login_user(*credentials)

    _timeout = timeout

    while True:
        response = _session.get(url, timeout=_timeout)
        response.raise_for_status()
        html = HTML(
            html=response.html.html.replace('<!--', '').replace('-->', ''))
        cursor_blob = html.html

        for article in html.find('article'):
            post = _extract_post(article)
            if extra_info:
                post = fetch_share_and_reactions(post)
            yield post

        pages -= 1
        if pages <= 0:
            return

        cursor = _find_cursor(cursor_blob)

        if cursor is not None:
            url = f'{_base_url}{cursor}'

        if sleep:
            time.sleep(sleep)
Example #31
def get_script_sources(url: str, body: str) -> List[str]:
    """Get script sources

    Arguments:
        url {str} -- A URL
        body {str} -- An HTTP response body

    Returns:
        List[str] -- A list of script sources
    """
    html = HTML(html=body)

    sources: List[str] = []
    for script in html.find("script"):
        source = script.attrs.get("src")
        if source is not None:
            sources.append(normalize_source(url, source))

    return list(set(sources))
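normalize_source is not defined in this snippet; one plausible implementation (an assumption, not the project's code) resolves relative and protocol-relative script paths against the page URL:

from urllib.parse import urljoin

def normalize_source(url: str, source: str) -> str:
    # urljoin leaves absolute URLs untouched and resolves relative or
    # protocol-relative ("//cdn.example.com/x.js") paths against the page URL.
    return urljoin(url, source)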
Example #32
class HTMLVotesParser:
    def __init__(self, html):
        self.html = HTML(html=html)
        self.date = None
        self.date_votes = None
        self.topic = None
        self.kind = None

    def next_td(self):
        for tr in self.html.find("tr"):
            for td in tr.find("td"):
                classes = td.attrs.get("class", ())
                yield td, classes

    def parse(self) -> VoteList:
        student = ""
        dates = []
        for td, classes in self.next_td():
            text = td.text
            if self._is_student(classes):
                student = td.find("span")[2].text
            if self._is_new_day(classes):
                self._init_new_day(dates, text)

            elif self._is_processing_day():
                self._process_day(classes, text)
        if self.date:
            dates.append((self.date, self.date_votes))
        return VoteList(student=student, votes=dates)

    def _process_day(self, classes, text):
        if "intestazioni" in classes:
            if not self.topic:
                self.topic = text
            else:
                self.kind = text
        elif "voto_" in classes:
            vote = Vote(self.topic, self.kind, text)
            self.topic = None
            self.date_votes.append(vote)

    def _is_processing_day(self):
        return self.date is not None

    def _init_new_day(self, dates, new_date):
        if self.date:
            dates.append((self.date, self.date_votes))
        self.date = new_date
        self.date_votes = []

    def _is_new_day(self, classes):
        return "registro" in classes

    def _is_student(self, classes):
        return "page-usr-user" in classes
Example #33
File: commands.py  Project: kylef/goji
def submit_form(session, response, data=None):
    from requests_html import HTML
    html = HTML(url=response.url, html=response.text)

    forms = html.find('form')
    if len(forms) == 0:
        raise Exception('Page does not have any forms')

    form = forms[0]
    url = form.attrs['action']
    fields = form.find('input')

    data = data or {}

    for field in fields:
        name = field.attrs['name']

        if name not in data:
            value = field.attrs['value']
            data[name] = value

    response = session.post(urljoin(response.url, url), data=data)
    return response
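Hypothetical usage of submit_form: fetch a page, then resubmit its first form with one field overridden while the remaining inputs keep their default values (URL and field name are placeholders):

import requests

session = requests.Session()
response = session.get('https://example.com/login')
response = submit_form(session, response, data={'username': 'alice'})
print(response.status_code)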
Example #34
def _weblint_html(path: pathlib.Path, doctype: str) -> set:
    '''HTML Lint for WebLint.
    '''

    DEPRECATED_TAGS = {
        'font', 'center', 's', 'strike', 'b', 'i', 'tt', 'small', 'frame',
        'frameset', 'noframes', 'acronym', 'big', 'u', 'isindex', 'basefont',
        'dir', 'applet', 'style',
    }

    REQUIRED_TAGS = {
        'html': (
            (('head', '==', 1), 'HS0013'),
            (('body', '==', 1), 'HS0014'),
        ),
        'head': (
            (('title', '==', 1), 'HS0015'),
            (('meta', '>=', 1), 'HS0018'),
            (('script', '==', 0), 'HP0001'),
        ),
        'ul': (
            (('li', '>=', 1), 'HS0019'),
        ),
        'ol': (
            (('li', '>=', 1), 'HS0020'),
        ),
        'select': (
            (('option', '>=', 1), 'HS0021'),
        ),
        'dl': (
            (('dt', '>=', 1), 'HS0022'),
            (('dd', '>=', 1), 'HS0023'),
        ),
        'video': (
            (('source', '>=', 1), 'HS0024'),
        ),
        'audio': (
            (('source', '>=', 1), 'HS0026'),
        ),
        'details': (
            (('summary', '==', 1), 'HS0029'),
        ),
        'aside': (
            (('main', '==', 0), 'HA0006'),
        ),
        'figure': (
            (('figcaption', '==', 1), 'HS0044'),
        ),
    }

    SELFCLOSED_TAGS = {
        'area', 'base', 'br', 'embed', 'hr', 'iframe', 'input', 'img', 'keygen',
        'link', 'meta', 'output', 'param', 'track', 'wbr', 'source',
    }

    CLOSE_TAGS = {
        'a', 'abbr', 'address', 'article', 'aside', 'audio',
        'bdi', 'bdo', 'blockquote', 'body', 'button',
        'canvas', 'caption', 'cite', 'code', 'col', 'colgroup',
        'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog', 'div',
            'dl', 'dt',
        'em',
        'fieldset', 'figure', 'figcaption', 'footer', 'form',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'html',
        'ins',
        'kbd',
        'label', 'legend', 'li',
        'main', 'map', 'menu', 'menuitem', 'meter',
        'nav', 'noscript',
        'object', 'ol', 'option', 'optgroup',
        'p', 'picture', 'pre', 'progress',
        'q',
        'rb', 'rp', 'rt', 'rtc', 'ruby',
        'samp', 'script', 'section', 'select', 'span', 'strong',
            'sub',  'summary', 'sup',
        'table', 'textarea', 'tbody', 'td', 'template', 'th', 'thead', 'time',
            'title', 'tfoot', 'tr',
        'ul',
        'var', 'video'
    }

    DEPRECATED_ATTRS = {
        'style', 'manifest', 'xmlns', 'align', 'alink', 'link', 'vlink',
        'text', 'background', 'bgcolor', 'border', 'char', 'charoff',
        'compact', 'frame', 'frameborder', 'hspace', 'nowrap', 'rules',
        'valign', 'accept', 'vspace',
    }

    GLOBAL_ATTRS = {
        'lang', 'id', 'class', 'title', 'hidden',
    }

    VALID_ATTRS = {
        'charset', 'name', 'src', 'content', 'controls', 'type', 'href',
        'alt', 'rel', 'value', 'min', 'max',
    }

    BOOL_ATTRS = {
        'controls', 'hidden',
    }

    REQUIRED_ATTRS = {
        'html': (('lang',), 'HS0012'),
        'video': (('controls',), 'HS0027'),
        'source': (('src', 'type'), 'HS0025'),
        'audio': (('controls',), 'HS0028'),
        'a': (('href',), 'HS0031'),
        'img': (('src',), 'HS0033'),
        'input': (('type',), 'HS0035'),
        'link': (('rel', 'href'), 'HS0040'),
        'script': (('src',), 'HS0042'),
        'progress': (('value', 'max'), 'HS0045'),
        'meter': (('value', 'min', 'max'), 'HS0046'),
    }

    REQUIRED_ATTRS_ACCESS = {
        'img': (('alt',), 'HA0001'),
        'a': (('title',), 'HA0007'),
    }

    NOEMPTY_TAGS = {
        ('title', 'HS0016'),
        ('p', 'HS0017'),
        ('summary', 'HS0030'),
        ('a', 'HS0032'),
        ('video', 'HA0002'),
        ('audio', 'HA0003'),
        ('h1', 'HS0036'),
        ('h2', 'HS0036'),
        ('h3', 'HS0036'),
        ('h4', 'HS0036'),
        ('h5', 'HS0036'),
        ('h6', 'HS0036'),
        ('meter', 'HA0008'),
    }

    class _StdHTMLParser(HTMLParser):
        def handle_decl(self, data):
            self.doctype = data
            self.not_paired_tags = []
            self._start_tags = []
            self.duplicated_attrs = []
            self.tag_not_lowercase = []
            self.empty_tags_not_closed = []

        def handle_starttag(self, tag, attrs):

            # Tag names must be written in lowercase.
            # The standard "html.parser" module has already converted the tag name
            # to lowercase, so recover the raw tag text to check the original case.
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self._start_tags.append(tag)
            else:
                self.empty_tags_not_closed.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def handle_endtag(self, tag):
            if tag == self._start_tags[-1]:
                self._start_tags.pop()
            else:
                if tag not in self._start_tags:
                    self.not_paired_tags.append((tag, self.lineno))
                else:
                    for t in reversed(self._start_tags):
                        if t != tag:
                            self.not_paired_tags.append((t, self.lineno))
                        else:
                            self._start_tags.pop()
                            break

        def handle_startendtag(self, tag, attrs):
            # tag name must be in lowercase
            rawtag = self._raw_tag()
            if not rawtag.islower():
                self.tag_not_lowercase.append((rawtag, self.lineno))

            if tag not in SELFCLOSED_TAGS:
                self.not_paired_tags.append((tag, self.lineno))
            self._handle_attrs(attrs)

        def _handle_attrs(self, attrs):
            attrnames = [a[0] for a in attrs]
            for a in attrs:
                name, _ = a

                # validate duplicated attributes
                c = attrnames.count(name)
                if c > 1 and (f'{name} {c}', self.lineno) not in self.duplicated_attrs:
                    self.duplicated_attrs.append((f'{name} {c}', self.lineno))

        def _raw_tag(self):
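            # Recover the tag text exactly as it appears in the source line;
            # HTMLParser has already lowercased self.lasttag.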
            lineno, pos = self.getpos()
            rawline = self.rawdata.splitlines()[lineno-1]
            return rawline[pos+1:pos+1+len(self.lasttag)]

    try:
        with path.open() as f:
            doc = f.read()
    except FileNotFoundError:
        return {Report('G00001', path, 0, '')}
    reports = set()

    # validate the DOCTYPE using the standard HTML parser, since
    # requests-html ignores the DOCTYPE
    lineno = 1
    obj = 'DOCTYPE'
    std_parser = _StdHTMLParser()
    std_parser.feed(doc)
    try:
        if std_parser.doctype != doctype:
            reports.add(Report('HS0002', path, lineno, obj))
            return reports

        rules = {
            'not_paired_tags': 'HS0005',
            'empty_tags_not_closed': 'HS0006',
            'duplicated_attrs': 'HS0009',
            'tag_not_lowercase': 'HS0010',
        }
        for a, e in rules.items():
            # no need to check attr exists,
            # since doctype has been checked before
            for t in getattr(std_parser, a):
                reports.add(Report(e, path, t[1], t[0]))

    except AttributeError:
        reports.add(Report('HS0001', path, lineno, obj))
        return reports
    finally:
        std_parser.close()

    all_ids = set()
    parser = HTML(html=doc)
    for element in parser.find():
        lxml_element = element.element
        tag = lxml_element.tag
        lineno = lxml_element.sourceline
        if tag in DEPRECATED_TAGS:
            reports.add(Report('HS0004', path, lineno, tag))
        elif tag not in CLOSE_TAGS | SELFCLOSED_TAGS:
            reports.add(Report('HS0003', path, lineno, tag))
        else:
            pass
        
        # validate required elements
        rules = REQUIRED_TAGS.get(tag)
        if rules is not None:
            for r in rules:
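                # r is ((child_tag, comparison, expected_count), error_code); the
                # comparison operator is interpolated into the expression and eval'd.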
                if eval(f'not len(element.find(r[0][0])) {r[0][1]} r[0][2]'):
                    reports.add(Report(r[1], path, lineno, r[0][0]))

        # validate required attributes
        rules = REQUIRED_ATTRS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # validate accessibility attributes
        rules = REQUIRED_ATTRS_ACCESS.get(tag)
        if rules is not None:
            for r in rules[0]:
                if r not in (a.lower() for a in element.attrs):
                    reports.add(Report(rules[1], path, lineno, r))

        # parse attributes
        for a, v in element.attrs.items():
            a_lower = a

            # validate attribute name must be in lowercase
            if not a.islower():
                reports.add(Report('HS0011', path, lineno, a))
                a_lower = a.lower()

            if a_lower in DEPRECATED_ATTRS:
                reports.add(Report('HS0008', path, lineno, a))
            elif a_lower not in GLOBAL_ATTRS | VALID_ATTRS:
                reports.add(Report('HS0007', path, lineno, a))
            
            # validate attribute's value is NOT empty
            if not v and a_lower not in BOOL_ATTRS:
                reports.add(Report('HS0034', path, lineno, a))

            if a_lower == 'id':
                if v in all_ids:
                    reports.add(Report('HS0037', path, lineno, f'id="{v}"'))
                all_ids.add(v)

    for t in NOEMPTY_TAGS:
        for e in parser.find(t[0]):
            if not e.text:
                reports.add(Report(t[1], path, e.element.sourceline, e.element.tag))

    # `<h1>` element must be present only once
    h1_list = parser.find('h1')
    if len(h1_list) > 1:
        e = h1_list[-1].element
        reports.add(Report('HA0004', path, e.sourceline, e.tag))

    # <main> element without "hidden" attribute must be present only once
    main_list = parser.find('main')
    main_count = len(main_list)
    main_hidden_count = len(parser.find('main[hidden]'))
    if main_count - main_hidden_count != 1:
        for e in main_list:
            reports.add(Report('HS0038', path, e.element.sourceline, 'main'))

    # <meta> element with "charset" attribute must be present only once
    meta_charset_list = parser.find('meta[charset]')
    meta_charset_count = len(meta_charset_list)
    if not meta_charset_count:
        reports.add(Report('HS0018', path, 0, 'meta charset'))
    elif meta_charset_count > 1:
        for e in meta_charset_list:
            obj = f'meta charset {meta_charset_count}'
            reports.add(Report('HS0009', path, e.element.sourceline, obj))

    # <input> element with "type=image" must have "src" and "alt" attributes
    for e in parser.find('input[type="image"]'):
        if 'src' not in e.attrs:
            reports.add(Report('HS0039', path, e.element.sourceline, 'src'))
        if 'alt' not in e.attrs:
            reports.add(Report('HA0005', path, e.element.sourceline, 'alt'))

    # <link> element must **NOT** have `type` attribute with value of `text/css`
    for e in parser.find('link[rel="stylesheet"]'):
        assert 'href' in e.attrs
        if e.attrs['href'].endswith('css'):
            if 'type' in e.attrs and e.attrs['type'] == 'text/css':
                l = e.element.sourceline
                reports.add(Report('HS0041', path, l, 'type'))

    # <script> element must **NOT** have `type` attribute with value of `text/javascript`
    for e in parser.find('script[type="text/javascript"]'):
        l = e.element.sourceline
        reports.add(Report('HS0043', path, l, 'type'))

    return reports