예제 #1
0
 def __get_level__(child: PyQuery) -> int:
     """
     Map a heading element to its numeric level.

     :param child: pyquery object to inspect
     :return: 1-4 when ``child`` matches ``h1``-``h4`` respectively,
              5 for anything else
     """
     # Tags are probed in the same order as their level numbers.
     for level, tag in enumerate(('h1', 'h2', 'h3', 'h4'), start=1):
         if child.is_(tag):
             return level
     return 5
def test_redirect(base_url, selenium, method, uri):
    """Check that ``uri`` ends up on the maintenance-mode page.

    ``uri`` is formatted with the ``en-US`` locale and joined onto
    ``base_url``.  For ``get`` the URL is loaded through ``selenium`` and
    the rendered page is inspected; for any other method the request is
    sent with ``requests`` (following redirects) and the final response
    body is inspected with PyQuery.
    """
    HEADING_TEXT = 'Maintenance Mode'
    HEADING_SELECTOR = '#content-main > h1'
    MM_URL_TEMPLATE = '{locale}/maintenance-mode'

    locale = 'en-US'
    http_method = method.lower()

    url = urljoin(base_url, uri.format(locale=locale))

    if http_method == 'get':
        # We do a get on the given URL but wait for the
        # maintenance-mode page to load via redirection.
        selenium.get(url)
        mm_page = BasePage(selenium, base_url, locale=locale)
        mm_page.URL_TEMPLATE = MM_URL_TEMPLATE
        mm_page.wait_for_page_to_load()
        mm_heading = mm_page.find_element(By.CSS_SELECTOR, HEADING_SELECTOR)
        assert mm_heading.is_displayed()
        assert HEADING_TEXT in mm_heading.text
        assert mm_page.is_maintenance_mode_banner_displayed
        assert not mm_page.header.is_signin_displayed
    else:
        request_method = getattr(requests, http_method)
        resp = request_method(url, allow_redirects=True)
        # The final response should be a successful load of the
        # maintenance-mode page in the given locale.
        assert resp.status_code == 200
        assert resp.url == urljoin(base_url,
                                   MM_URL_TEMPLATE.format(locale=locale))
        pq = PyQuery(resp.text)
        assert HEADING_TEXT in pq(HEADING_SELECTOR).text()
        assert (BasePage.MM_BANNER_TEXT
                in pq(BasePage.MM_BANNER_SELECTOR).text())
        # Search the whole document for the sign-in element.  ``pq.is_()``
        # would only test the root element against the selector, which
        # makes the assertion pass no matter what the page contains.
        assert not pq(BasePage.Header.SIGNIN_SELECTOR)
def test_redirect(base_url, selenium, method, uri):
    """Verify that requesting ``uri`` lands on the maintenance-mode page.

    The ``uri`` template is filled in with the ``en-US`` locale and
    resolved against ``base_url``.  A ``get`` is exercised in the browser
    via ``selenium``; every other method goes through ``requests`` with
    redirects followed, and the returned HTML is checked with PyQuery.
    """
    HEADING_TEXT = 'Maintenance Mode'
    HEADING_SELECTOR = '#content-main > h1'
    MM_URL_TEMPLATE = '{locale}/maintenance-mode'

    locale = 'en-US'
    verb = method.lower()

    url = urljoin(base_url, uri.format(locale=locale))

    if verb == 'get':
        # We do a get on the given URL but wait for the
        # maintenance-mode page to load via redirection.
        selenium.get(url)
        mm_page = BasePage(selenium, base_url, locale=locale)
        mm_page.URL_TEMPLATE = MM_URL_TEMPLATE
        mm_page.wait_for_page_to_load()
        mm_heading = mm_page.find_element(By.CSS_SELECTOR, HEADING_SELECTOR)
        assert mm_heading.is_displayed()
        assert HEADING_TEXT in mm_heading.text
        assert mm_page.is_maintenance_mode_banner_displayed
        assert not mm_page.header.is_signin_displayed
    else:
        request_method = getattr(requests, verb)
        resp = request_method(url, allow_redirects=True)
        # The final response should be a successful load of the
        # maintenance-mode page in the given locale.
        assert resp.status_code == 200
        assert resp.url == urljoin(base_url,
                                   MM_URL_TEMPLATE.format(locale=locale))
        pq = PyQuery(resp.text)
        assert HEADING_TEXT in pq(HEADING_SELECTOR).text()
        assert (BasePage.MM_BANNER_TEXT in
                pq(BasePage.MM_BANNER_SELECTOR).text())
        # ``pq.is_(selector)`` only checks whether the document root itself
        # matches, so it can never detect a sign-in element nested in the
        # page.  Select descendants instead and require that none exist.
        assert not pq(BasePage.Header.SIGNIN_SELECTOR)
예제 #4
0
def is_div(partial, cls_name=None, id_name=None):
    """Tell whether an HTML partial is a well formatted div.

    Params:
        partial (str): an HTML content (partial HTML code) page to test.
        cls_name (str|None): if not `None` the name of the class that the div
            in `partial` must have.
        id_name (str|None): if not `None` the name of the id that the div
            in `partial` must have.

    Returns:
        bool: True if `partial` is a well formatted div page with the provided
            class (if provided) and id (if provided), False if not.

    Examples:
        >>> is_div("<div>Plop</div>")
        True

        >>> is_div("<span>Plop</span>")
        False

        >>> is_div("<!DOCTYPE html><html>Hello</html>")
        False

        >>> is_div('<div class="useful">Plop</div>', "useful")
        True

        >>> is_div('<div class="useless">Plop</div>', "useful")
        False

        >>> is_div('<div class="useful" id="cat">Plop</div>', "useful", "cat")
        True

        >>> is_div('<div class="useful" id="dog">Plop</div>', "useful", "cat")
        False

        >>> is_div('<div class="useful">Plop</div>', "useful", "cat")
        False
    """
    parsed = PyQuery(partial)

    # Each check is evaluated up front; a check whose argument is not
    # provided (falsy) counts as satisfied.
    is_div_tag = parsed.is_("div")
    class_matches = True if not cls_name else parsed.has_class(cls_name)
    id_matches = True if not id_name else parsed.is_("#%s" % id_name)

    return all((is_div_tag, class_matches, id_matches))
예제 #5
0
 def __is_(child: PyQuery, tag_list: List[str]) -> bool:
     """
     Return true if the element matches at least one of the given tags.

     :param child: pyquery object
     :param tag_list: list of tag names to test, tried in order
     :return: True on the first matching tag, False when none match
              (including when ``tag_list`` is empty)
     """
     return any(child.is_(tag_name) for tag_name in tag_list)
예제 #6
0
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Describe an HTML node as a nested dict.

    The node is wrapped in ``PyQuery``.  When it has child elements, every
    entry of its contents is converted recursively and the truthy results
    form the ``contents`` list; otherwise ``contents`` is the node's inner
    HTML.  ``PRE_TAG_MATCH`` may rename the tag and ``PRE_CONTENTS_MATCH``
    may replace the contents, both keyed by the original tag name.

    :param elem: node to convert (anything ``PyQuery`` accepts).
    :param already_wrapped: when false, the contents are taken from
        ``fix_unwrapped_text(elem)`` before recursing; recursive calls
        always pass ``True``.
    :return: dict with the keys ``type``, ``attrs``, ``layout``,
        ``contents`` and ``extra``.
    """
    node = PyQuery(elem)
    child_nodes = list(node.contents())

    if len(node.children()) > 0:
        if not already_wrapped:
            # Fix unwrapped children
            child_nodes = fix_unwrapped_text(node).contents()

        converted = (
            build_dict_from_sane_json(child, already_wrapped=True)
            for child in child_nodes
        )
        contents = [child_dict for child_dict in converted if child_dict]
    else:
        contents = node.html()

    # Only tables need the HTML (to use later for extraction of relevant data)
    extra = {'original_html': str(node)} if node.is_("table") else {}

    root = node[0]
    # Carry over link/source attributes when the element defines them.
    for attr_name in ('src', 'href'):
        if attr_name in root.attrib:
            extra[attr_name] = node.attr(attr_name)

    tag_type = root.tag

    return {
        'type': PRE_TAG_MATCH.get(tag_type, tag_type),
        'attrs': [],
        'layout': {},
        'contents': PRE_CONTENTS_MATCH.get(tag_type, contents),
        'extra': extra
    }
예제 #7
0
파일: forvo.py 프로젝트: hrdrq/dictionary
 def parse_items(self, urls):
     """Fetch ``urls`` concurrently and collect pronunciation entries.

     One thread per URL runs ``get(url, docs)``; the documents found in
     ``docs`` afterwards (presumably appended by ``get`` - confirm against
     its definition) are scanned for pronunciation list items.  For every
     item whose ``span.play`` carries a recognised ``onclick`` handler a
     dict with the keys ``word``, ``url``, ``word_id`` and ``user`` is
     appended to ``self.results``.

     :param urls: iterable of URLs to fetch.
     :return: None; results are accumulated in ``self.results``.
     """
     # Compiled once instead of inside the nested loops.  Raw strings keep
     # the exact same patterns without invalid-escape warnings.
     word_id_re = re.compile(r"notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]")
     word_re = re.compile(r"(.*) の発音")
     user_re = re.compile(r"発音したユーザ: (.*) \(")
     # (onclick pattern, audio base URL) pairs, tried in order; the first
     # pattern that matches decides the URL.
     audio_sources = (
         (re.compile(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)"),
          'https://audio00.forvo.com/mp3/'),
         (re.compile(r"PlayPhrase\(.*,'(.*)',.*\)"),
          'https://audio00.forvo.com/phrases/mp3/'),
     )

     docs = []
     threads = [
         threading.Thread(target=get, args=(url, docs)) for url in urls
     ]
     for thread in threads:
         thread.start()
     for thread in threads:
         thread.join()

     for item_doc in docs:
         # ``html()`` may yield None for an empty document.
         id_match = word_id_re.search(item_doc.html() or '')
         word_id = id_match.group(2) if id_match else None

         for locale in item_doc("article.pronunciations"):
             locale = PyQuery(locale)
             lang_header = locale('header[id=%s]' % self.lang.split('_')[0])
             if not lang_header:
                 continue
             word_match = word_re.search(lang_header.text())
             if not word_match:
                 # Header text is not in the expected form; skip this
                 # section instead of failing on ``None.group``.
                 continue
             word = word_match.group(1)

             if self.lang == 'en_usa':
                 els = locale('header[id=%s]' % self.lang).next_all()
             else:
                 els = locale('.show-all-pronunciations li')

             for el in els:
                 el = PyQuery(el)
                 if el.has_class('li-ad'):
                     continue
                 if el.is_('header'):
                     # The next header ends the current list.
                     break

                 play = PyQuery(el('span.play'))
                 onclick = play.attr('onclick')
                 if not onclick:
                     # No play button / handler: nothing to extract.
                     continue

                 text = play.parents('li').eq(0).text()
                 user_match = user_re.search(text)
                 user = user_match.group(1) if user_match else None

                 for pattern, audio_base in audio_sources:
                     code_match = pattern.search(onclick)
                     if code_match:
                         self.results.append({
                             'word': word,
                             'url': audio_base +
                             base64_decode(code_match.group(1)),
                             'word_id': word_id,
                             'user': user
                         })
                         break
    def _render_element(self,
                        p: Paragraph,
                        element: str or Element,
                        is_root=False,
                        bold=False,
                        italic=False,
                        strike=False,
                        underline=False,
                        font_size=None,
                        sup=False,
                        sub=False):
        """
        Render one HTML node into the Word document.

        A plain string becomes a run in ``p`` carrying the given formatting
        flags.  An element is dispatched on its tag: inline formatting tags
        switch on one flag and recurse, block-level tags are handed to the
        dedicated ``_render_*`` helpers, and unknown tags have their
        contents rendered into a new paragraph.

        :param p: paragraph that receives the output
        :param element: text or HTML element to render
        :param is_root: when true, a ``<p>`` element is rendered into ``p``
                        itself rather than into a new paragraph
        :param bold: inherited bold flag
        :param italic: inherited italic flag
        :param strike: inherited strike-through flag
        :param underline: inherited underline flag
        :param font_size: inherited font size, or None to leave it unset
        :param sup: inherited superscript flag
        :param sub: inherited subscript flag
        :return: None
        """
        # NOTE(review): ``str or Element`` in the signature evaluates to
        # plain ``str``; a union was probably intended.
        if isinstance(element, str):
            text_run = p.add_run(self._clear_text(element))
            text_run.bold = bold
            text_run.italic = italic
            text_run.font.strike = strike
            text_run.font.underline = underline
            text_run.font.subscript = sub
            text_run.font.superscript = sup
            if font_size:
                text_run.font.size = font_size
            self.__force_simsun(text_run)
            return

        node = PyQuery(element)

        # Nested <p> is not supported; it is flattened automatically.
        if node.is_('p'):
            children = node.contents()
            align = self._get_pq_style(node, 'text-align')

            if align == 'center':
                alignment = WD_ALIGN_PARAGRAPH.CENTER
            elif align == 'right':
                alignment = WD_ALIGN_PARAGRAPH.RIGHT
            else:
                alignment = WD_ALIGN_PARAGRAPH.LEFT

            # The incoming paragraph is aligned in both cases.
            p.alignment = alignment
            target = p
            if not is_root:
                target = p._parent.add_paragraph()
                target.alignment = alignment
            self._render_children(target, children)
            return

        # Formatting handed down unchanged unless a tag overrides one flag.
        inherited = {
            'underline': underline,
            'bold': bold,
            'italic': italic,
            'strike': strike,
            'font_size': font_size,
            'sub': sub,
            'sup': sup,
        }
        # (tag, flag forced to True), probed in this order.
        inline_overrides = (
            ('u', 'underline'),    # underline
            ('strong', 'bold'),    # bold
            ('b', 'bold'),
            ('i', 'italic'),       # italic
            ('em', 'italic'),
            ('sub', 'sub'),        # subscript
            ('sup', 'sup'),        # superscript
            ('var', 'italic'),     # legacy formula markup
        )
        for tag, flag in inline_overrides:
            if node.is_(tag):
                self.__render_inline_element(p,
                                             node,
                                             **{**inherited, flag: True})
                return

        if node.is_('span'):
            # sub/sup are not forwarded to the span renderer.
            self._render_span(p,
                              node,
                              bold=bold,
                              italic=italic,
                              strike=strike,
                              underline=underline,
                              font_size=font_size)
            return

        if node.is_("br"):
            p.add_run().add_break()
            return

        if node.is_("div"):
            # A div stays in the current paragraph, preceded by a break.
            p.add_run().add_break()
            self._render_children(p, node.contents())
            return

        # (tag, name of the helper method that renders it)
        block_renderers = (
            ('ul', '_render_unorder_list'),
            ('ol', '_render_order_list'),
            ('table', '_render_table'),
            ('img', '_render_img'),  # image
        )
        for tag, renderer_name in block_renderers:
            if node.is_(tag):
                getattr(self, renderer_name)(p, node)
                return

        if element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            # Headings get their own paragraph: always bold, italic, 12pt.
            heading_p = p._parent.add_paragraph()
            self.__render_inline_element(heading_p,
                                         node,
                                         bold=True,
                                         font_size=Pt(12),
                                         underline=underline,
                                         italic=True,
                                         strike=strike,
                                         sub=sub,
                                         sup=sup)
            return

        # Any other tag: render its contents into a fresh paragraph.
        fallback_p = p._parent.add_paragraph()
        self._render_children(fallback_p, node.contents())
예제 #9
0
def is_pq_object_visible(el: PyQuery):
    """Return False for ``script``, ``noscript`` and ``style`` elements.

    Any element matching none of those three tags is reported as visible
    (True).
    """
    hidden_tags = ("script", "noscript", "style")
    return not any(el.is_(tag) for tag in hidden_tags)