def parse_mkdocs(content, section, url):  # pylint: disable=unused-argument
    """Return the embeddable HTML for one section of a MkDocs page.

    :param content: mapping whose ``'content'`` key holds the page HTML.
    :param section: title of the requested section. When falsy, the first
        key of the first ``h2`` header entry (lower-cased) is used instead.
    :param url: not used by this parser.
    :returns: ``(None, None, section)`` when ``content`` is empty or has no
        ``'content'`` value; otherwise ``(fragments, headers, section)`` where
        ``fragments`` is a list of HTML strings (one per matching header that
        has following content) and ``headers`` holds one
        ``recurse_while_none`` result per ``h2`` element of the page.
    """
    if not content or not content.get('content'):
        return (None, None, section)

    body = content['content']
    headers = [recurse_while_none(h2_element) for h2_element in PQ(body)('h2')]

    if not section and headers:
        # No section requested: fall back to the first header of the page.
        first_header_titles = list(headers[0].keys())
        section = first_header_titles[0].lower()

    fragments = []
    if not section:
        return (fragments, headers, section)

    document = PQ(body)
    selector = ':header:contains("{title}")'.format(
        title=str(escape_selector(section)),
    )
    for header in document(selector).items():
        # Gather every sibling that follows the matched header, stopping at
        # the next ``h2`` (the selector itself matches any header level).
        pieces = []
        sibling = header.next()
        while sibling and sibling[0].tag != 'h2':
            sibling_html = sibling.outerHtml()
            if sibling_html:
                pieces.append("\n%s\n" % sibling_html)
            sibling = sibling.next()
        if pieces:
            fragments.append("".join(pieces))

    return (fragments, headers, section)
def parse_sphinx(content, section, url):
    """Return the embeddable HTML for one section of a Sphinx page.

    :param content: mapping with the page HTML under ``'body'`` and the table
        of contents HTML under ``'toc'``. May be ``None`` or empty.
    :param section: title or id of the requested section. When falsy, the
        first key of the first TOC entry (lower-cased) is used instead.
    :param url: passed to ``clean_links`` together with each matched element.
    :returns: ``(None, None, section)`` when ``content``, its body or its toc
        is missing; ``([], headers, None)`` when no section can be
        determined; otherwise ``(fragments, headers, section)`` where
        ``fragments`` is a list of HTML strings for the matched elements and
        ``headers`` holds one ``recurse_while_none`` result per ``a`` element
        of the toc.
    """
    # Validate ``content`` before reading from it, so a missing payload gives
    # the "nothing found" tuple instead of raising AttributeError.
    if not content:
        return (None, None, section)

    body = content.get('body')
    toc = content.get('toc')
    if not body or not toc:
        return (None, None, section)

    headers = [recurse_while_none(element) for element in PQ(toc)('a')]

    if not section and headers:
        # If no section is sent, return the content of the first one
        # TODO: This will always be the full page content,
        # lets do something smarter here
        section = list(headers[0].keys())[0].lower()

    if not section:
        return [], headers, None

    body_obj = PQ(body)
    escaped_section = escape_selector(section)

    # Candidate element ids, tried in order; the first one that matches wins.
    elements_id = [
        escaped_section,
        slugify(escaped_section),
        make_id(escaped_section),
        f'module-{escaped_section}',
    ]

    query_result = []
    for element_id in elements_id:
        if not element_id:
            continue
        query_result = body_obj(f'#{element_id}')
        if query_result:
            break

    if not query_result:
        # No id matched: fall back to a header whose text contains the
        # section, and take its parent element.
        selector = f':header:contains("{escaped_section}")'
        query_result = body_obj(selector).parent()

    # Handle ``dt`` special cases
    if len(query_result) == 1 and query_result[0].tag == 'dt':
        parent = query_result.parent()
        # ``attr`` yields ``None`` when the parent has no ``class`` attribute;
        # normalise to '' so the membership tests below cannot raise.
        parent_classes = parent.attr('class') or ''
        if 'glossary' in parent_classes:
            # Sphinx HTML structure for term glossary puts the ``id`` in the
            # ``dt`` element with the title of the term. In this case, we
            # need to return the next sibling which contains the definition
            # of the term itself.

            # Structure:
            # <dl class="glossary docutils">
            # <dt id="term-definition">definition</dt>
            # <dd>Text definition for the term</dd>
            # ...
            # </dl>
            query_result = query_result.next()
        elif 'citation' in parent_classes:
            # Sphinx HTML structure for sphinxcontrib-bibtex puts the ``id``
            # in the ``dt`` element with the title of the cite. In this case,
            # we need to return the next sibling which contains the cite
            # itself.

            # Structure:
            # <dl class="citation">
            # <dt id="cite-id"><span><a>Title of the cite</a></span></dt>
            # <dd>Content of the cite</dd>
            # ...
            # </dl>
            query_result = query_result.next()
        else:
            # Sphinx HTML structure for definition list puts the ``id`` in
            # the ``dt`` element, instead of the ``dl``. This makes
            # the backend to return just the title of the definition. If we
            # detect this case, we return the parent (the whole ``dl``)

            # Structure:
            # <dl class="confval">
            # <dt id="confval-config">
            # <code class="descname">config</code>
            # <a class="headerlink" href="#confval-config">¶</a></dt>
            # <dd><p>Text with a description</p></dd>
            # </dl>
            query_result = parent

    def dump(obj):
        """Handle API-based doc HTML."""
        # For a ``span`` or ``h2`` match, emit the enclosing element instead
        # of the matched element itself.
        if obj[0].tag in ['span', 'h2']:
            return obj.parent().outerHtml()
        return obj.outerHtml()

    ret = [dump(clean_links(PQ(obj), url)) for obj in query_result]
    return ret, headers, section