def __get_level__(child: PyQuery) -> int:
    """Return the heading level of ``child``.

    :param child: pyquery object to inspect
    :return: 1 to 4 when the element matches ``h1`` to ``h4`` respectively,
             5 for everything else
    """
    # Tags are probed in ascending order; the first match decides the level.
    for level, tag in enumerate(('h1', 'h2', 'h3', 'h4'), start=1):
        if child.is_(tag):
            return level
    return 5
def test_redirect(base_url, selenium, method, uri):
    """Check that a request to ``uri`` ends up on the maintenance-mode page.

    For ``get`` the URL is opened through the ``selenium`` driver and the
    resulting page is inspected via ``BasePage``.  For every other method the
    request is issued with ``requests`` (following redirects) and the final
    response body is inspected with PyQuery.

    Args:
        base_url: root URL of the site under test.
        selenium: driver object used for the browser-based branch.
        method: HTTP method name; compared case-insensitively.
        uri: path template containing a ``{locale}`` placeholder.
    """
    HEADING_TEXT = 'Maintenance Mode'
    HEADING_SELECTOR = '#content-main > h1'
    MM_URL_TEMPLATE = '{locale}/maintenance-mode'
    locale = 'en-US'
    url = urljoin(base_url, uri.format(locale=locale))
    http_method = method.lower()

    if http_method == 'get':
        # We do a get on the given URL but wait for the
        # maintenance-mode page to load via redirection.
        selenium.get(url)
        mm_page = BasePage(selenium, base_url, locale=locale)
        mm_page.URL_TEMPLATE = MM_URL_TEMPLATE
        mm_page.wait_for_page_to_load()
        mm_heading = mm_page.find_element(By.CSS_SELECTOR, HEADING_SELECTOR)
        assert mm_heading.is_displayed()
        assert HEADING_TEXT in mm_heading.text
        assert mm_page.is_maintenance_mode_banner_displayed
        assert not mm_page.header.is_signin_displayed
    else:
        request_method = getattr(requests, http_method)
        resp = request_method(url, allow_redirects=True)
        # The final response should be a successful load of the
        # maintenance-mode page in the given locale.
        assert resp.status_code == 200
        assert resp.url == urljoin(base_url,
                                   MM_URL_TEMPLATE.format(locale=locale))
        pq = PyQuery(resp.text)
        assert HEADING_TEXT in pq(HEADING_SELECTOR).text()
        assert (BasePage.MM_BANNER_TEXT in
                pq(BasePage.MM_BANNER_SELECTOR).text())
        # Search the document for the sign-in element.  ``pq.is_(selector)``
        # would only test the root element itself against the selector, so
        # it could never detect a sign-in link nested in the page.
        assert len(pq(BasePage.Header.SIGNIN_SELECTOR)) == 0
def is_div(partial, cls_name=None, id_name=None):
    """Tell whether ``partial`` is a well-formatted div partial.

    Params:
        partial (str): an HTML content (partial HTML code) page to test.
        cls_name (str|None): if not `None`, the class that the div in
            `partial` must have.
        id_name (str|None): if not `None`, the id that the div in
            `partial` must have.

    Returns:
        bool: True if `partial` is a div carrying the requested class (when
            given) and the requested id (when given), False otherwise.

    Examples:
        >>> is_div("<div>Plop</div>")
        True
        >>> is_div("<span>Plop</span>")
        False
        >>> is_div("<!DOCTYPE html><html>Hello</html>")
        False
        >>> is_div('<div class="useful">Plop</div>', "useful")
        True
        >>> is_div('<div class="useless">Plop</div>', "useful")
        False
        >>> is_div('<div class="useful" id="cat">Plop</div>', "useful", "cat")
        True
        >>> is_div('<div class="useful" id="dog">Plop</div>', "useful", "cat")
        False
        >>> is_div('<div class="useful">Plop</div>', "useful", "cat")
        False
    """
    doc = PyQuery(partial)

    # Every requested criterion is evaluated; the optional ones are simply
    # left out of the list when the caller did not ask for them.
    checks = [doc.is_("div")]
    if cls_name:
        checks.append(doc.has_class(cls_name))
    if id_name:
        checks.append(doc.is_("#%s" % id_name))
    return all(checks)
def __is_(child: PyQuery, tag_list: List[str]) -> bool:
    """
    Return true if the element matches at least one of the given tags.

    :param child: pyquery object
    :param tag_list: list of tag names to test against
    :return: True as soon as one tag matches, False if none does
             (including when ``tag_list`` is empty)
    """
    return any(child.is_(tag) for tag in tag_list)
def build_dict_from_sane_json(elem: PyQuery, already_wrapped=False) -> dict:
    """Convert ``elem`` (and, recursively, its content) into a plain dict.

    The result always has the keys ``type``, ``attrs``, ``layout``,
    ``contents`` and ``extra``:

    * ``type`` is the element tag, translated through ``PRE_TAG_MATCH`` when
      the tag has an entry there.
    * ``contents`` is a list of child dicts when the element has child
      elements, otherwise the element's inner HTML; either is replaced by
      the ``PRE_CONTENTS_MATCH`` entry for the tag when one exists.
    * ``extra`` carries ``original_html`` for tables plus ``src`` / ``href``
      when the element has those attributes.
    * ``attrs`` and ``layout`` are always empty.

    :param elem: element to convert; it is re-wrapped in ``PyQuery`` first.
    :param already_wrapped: when False, the children are taken from
        ``fix_unwrapped_text(elem)`` instead of from ``elem`` directly.
        Recursive calls always pass True.
    :return: the dict described above.
    """
    elem = PyQuery(elem)
    nodes = list(elem.contents())

    if len(elem.children()) > 0:
        # Only the outermost call needs to repair unwrapped text nodes.
        if not already_wrapped:
            nodes = fix_unwrapped_text(elem).contents()
        converted = (
            build_dict_from_sane_json(node, already_wrapped=True)
            for node in nodes
        )
        payload = [entry for entry in converted if entry]
    else:
        payload = elem.html()

    # Only tables need the HTML (to use later for extraction of relevant data)
    extras = {'original_html': str(elem)} if elem.is_("table") else {}

    root = elem[0]
    for attr_name in ('src', 'href'):
        if attr_name in root.attrib:
            extras[attr_name] = elem.attr(attr_name)

    tag = root.tag
    return {
        'type': PRE_TAG_MATCH.get(tag, tag),
        'attrs': [],
        'layout': {},
        'contents': PRE_CONTENTS_MATCH.get(tag, payload),
        'extra': extras,
    }
def parse_items(self, urls):
    """Fetch every page in ``urls`` and collect its pronunciation entries.

    One thread per URL runs ``get(url, docs)``; after all threads have been
    joined, every entry of ``docs`` is used as a PyQuery document.  For each
    ``article.pronunciations`` section that has a header for the configured
    language, the items below it are scanned and one dict with the keys
    ``word``, ``url``, ``word_id`` and ``user`` is appended to
    ``self.results`` per item whose ``onclick`` handler is a recognised
    ``Play(...)`` or ``PlayPhrase(...)`` call.

    Sections whose header text does not match the expected pattern, and
    items without a usable ``onclick`` attribute, are skipped instead of
    aborting the whole run.

    Args:
        urls: iterable of page URLs to download.

    Returns:
        None. Results are accumulated in ``self.results``.
    """
    docs = []
    threads = [
        threading.Thread(target=get, args=(url, docs)) for url in urls
    ]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # Patterns are raw strings (the escapes are regex escapes, not string
    # escapes) and are compiled once instead of once per item.
    word_id_re = re.compile(r"notSatisfied(Lang)?\( ?'(\d+)' ?[,\)]")
    word_re = re.compile(r"(.*) の発音")
    user_re = re.compile(r"発音したユーザ: (.*) \(")
    # (onclick pattern, URL prefix) pairs, tried in order; first match wins.
    audio_sources = (
        (re.compile(r"Play\(.*,'(.*)',.*,.*,.*,.*,.*\)"),
         'https://audio00.forvo.com/mp3/'),
        (re.compile(r"PlayPhrase\(.*,'(.*)',.*\)"),
         'https://audio00.forvo.com/phrases/mp3/'),
    )
    base_lang = self.lang.split('_')[0]

    for item_doc in docs:
        match = word_id_re.search(item_doc.html() or '')
        word_id = match.group(2) if match else None

        for locale in item_doc("article.pronunciations"):
            locale = PyQuery(locale)
            lang_header = locale('header[id=%s]' % base_lang)
            if not lang_header:
                continue

            match = word_re.search(lang_header.text())
            if not match:
                # Header text is not in the expected form; without the word
                # there is nothing useful to record for this section.
                continue
            word = match.group(1)

            if self.lang == 'en_usa':
                els = locale('header[id=%s]' % self.lang).next_all()
            else:
                els = locale('.show-all-pronunciations li')

            for el in els:
                el = PyQuery(el)
                if el.has_class('li-ad'):
                    continue
                if el.is_('header'):
                    # The next header starts another group of items.
                    break

                play = PyQuery(el('span.play'))
                onclick = play.attr('onclick')
                if not onclick:
                    # No play button (or no handler) on this item.
                    continue

                text = play.parents('li').eq(0).text()
                match = user_re.search(text)
                user = match.group(1) if match else None

                for pattern, url_prefix in audio_sources:
                    match = pattern.search(onclick)
                    if match:
                        self.results.append({
                            'word': word,
                            'url': url_prefix + base64_decode(match.group(1)),
                            'word_id': word_id,
                            'user': user
                        })
                        break
def _render_element(self, p: Paragraph, element: "str | Element", is_root=False, bold=False, italic=False,
                    strike=False, underline=False, font_size=None, sup=False, sub=False):
    """
    Convert an HTML node into Word content.

    A plain string becomes a run on ``p`` carrying the given formatting.
    Any other value is wrapped in ``PyQuery`` and dispatched on its tag to
    the matching renderer.

    :param p: paragraph that receives the rendered content
    :param element: text, or the HTML element to render
    :param is_root: for ``<p>`` elements, render into ``p`` itself instead
                    of into a new paragraph added to ``p``'s parent
    :param bold: formatting inherited from enclosing elements
    :param italic: formatting inherited from enclosing elements
    :param strike: formatting inherited from enclosing elements
    :param underline: formatting inherited from enclosing elements
    :param font_size: font size to apply to runs; left untouched when falsy
    :param sup: superscript flag inherited from enclosing elements
    :param sub: subscript flag inherited from enclosing elements
    :return: None
    """
    if isinstance(element, str):
        run = p.add_run(self._clear_text(element))
        run.bold = bold
        run.italic = italic
        run.font.strike = strike
        run.font.underline = underline
        run.font.subscript = sub
        run.font.superscript = sup
        if font_size:
            run.font.size = font_size
        self.__force_simsun(run)
        return

    pq = PyQuery(element)

    if pq.is_('p'):
        # Nested <p> is not supported; it is flattened automatically.
        contents = pq.contents()
        align = self._get_pq_style(pq, 'text-align')
        if align == 'center':
            alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == 'right':
            alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else:
            alignment = WD_ALIGN_PARAGRAPH.LEFT

        # NOTE(review): the alignment is applied to ``p`` even when the
        # content goes into a new paragraph below -- kept as in the
        # original behaviour; confirm this is intended.
        p.alignment = alignment
        if is_root:
            self._render_children(p, contents)
        else:
            sub_p = p._parent.add_paragraph()
            sub_p.alignment = alignment
            self._render_children(sub_p, contents)
        return

    # Formatting handed down unchanged unless the tag overrides one flag.
    inherited = {
        'underline': underline,
        'bold': bold,
        'italic': italic,
        'strike': strike,
        'font_size': font_size,
        'sub': sub,
        'sup': sup,
    }
    # Inline tag -> the single flag it switches on.
    inline_flags = (
        ('u', 'underline'),      # underline
        ('strong', 'bold'),      # bold
        ('b', 'bold'),
        ('i', 'italic'),         # italic
        ('em', 'italic'),
        ('sub', 'sub'),          # subscript
        ('sup', 'sup'),          # superscript
        ('var', 'italic'),       # legacy formula markup
    )
    for tag, flag in inline_flags:
        if pq.is_(tag):
            self.__render_inline_element(p, pq, **{**inherited, flag: True})
            return

    if pq.is_('span'):
        self._render_span(p, pq, bold=bold, italic=italic, strike=strike, underline=underline,
                          font_size=font_size)
    elif pq.is_("br"):
        p.add_run().add_break()
    elif pq.is_("div"):
        # A div continues in the current paragraph after a line break
        # rather than opening a new paragraph.
        p.add_run().add_break()
        self._render_children(p, pq.contents())
    elif pq.is_('ul'):
        self._render_unorder_list(p, pq)
    elif pq.is_('ol'):
        self._render_order_list(p, pq)
    elif pq.is_('table'):
        self._render_table(p, pq)
    elif pq.is_('img'):
        # image
        self._render_img(p, pq)
    elif element.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        # Every heading level is rendered the same way: its own paragraph,
        # bold + italic at 12pt.
        sub_p = p._parent.add_paragraph()
        self.__render_inline_element(sub_p, pq, bold=True, font_size=Pt(12), underline=underline,
                                     italic=True, strike=strike, sub=sub, sup=sup)
    else:
        # Unknown tag: render its children into a fresh paragraph.
        sub_p = p._parent.add_paragraph()
        self._render_children(sub_p, pq.contents())
def is_pq_object_visible(el: PyQuery):
    """Return False for ``script``, ``noscript`` and ``style`` elements.

    Every other element is reported as visible (True).
    """
    hidden_tags = ("script", "noscript", "style")
    return not any(el.is_(tag) for tag in hidden_tags)