def html2rst(text, images_dir):
    r"""
    Convert HTML, typically generated by tinyMCE, into rst
    compatible with Sage documentation.

    The main job is done by BeautifulSoup, which is much more
    robust than conventional parsers like HTMLParser, but
    several details specific to this context are also taken into
    account, so this code differs from generic approaches like
    those found on the web.

    The conversion performs these steps, in order:

    1. ``preprocess_display_latex`` rewrites ``$$...$$`` display math
       as ``<display>...</display>`` elements.
    2. ``&nbsp;`` entities are replaced by plain spaces.
    3. The text is parsed with ``ICantBelieveItsBeautifulSoup``, with
       all HTML entities converted.
    4. HTML comments are removed from the tree.
    5. ``replace_latex`` is applied to the tree.
    6. A ``Soup2Rst`` visitor, built with ``images_dir``, renders the
       tree as rst.
    7. Runs of three or more newlines are collapsed to a single blank
       line, and ``replace_xml_entities`` is applied to the result.

    This docstring is a raw string so that the backslashes in the
    examples below reach doctest unchanged.

    INPUT:

    - ``text`` -- string -- a chunk of HTML text

    - ``images_dir`` -- string -- folder where images are stored

    OUTPUT:

    - string -- rst text

    EXAMPLES::

        sage: from sagenb.misc.comments2rst import html2rst # optional - beautifulsoup
        sage: html2rst('<p>Some text with <em>math</em>: $e^{\pi i}=-1$</p>', '') # optional - beautifulsoup
        u'Some text with *math* : :math:`e^{\\pi i}=-1`\n\n'
        sage: html2rst('<p>Text with <em>incorrect</p> nesting</em>.', '') # optional - beautifulsoup
        u'Text with *incorrect* \n\n nesting\n.'
        sage: html2rst('<pre>Preformatted: \n a+2\n</pre><p> Not preformatted: \n a+2\n</p>', '') # optional - beautifulsoup
        u'::\n\n Preformatted: \n a+2\n \n Not preformatted: a\\+2\n\n'
        sage: html2rst('áñ ñá','') # optional - beautifulsoup
        u'\xe1\xf1 \xf1\xe1'
        sage: html2rst('<p>some text</p><p>$$</p><p>3.183098861 \cdot 10^{-1}</p><p>$$</p>','') # optional - beautifulsoup
        u'some text\n\n.. MATH::\n\n 3.183098861 \\cdot 10^{-1}\n\n.. end of math\n\n'
    """
    # Replace $$some display latex$$ with
    # <display>some display latex</display>
    text = preprocess_display_latex(text)

    # Eliminate the non-breaking-space entity before parsing; otherwise
    # the entity conversion below would turn it into U+00A0 instead of
    # an ordinary space.
    text = text.replace('&nbsp;', ' ')

    # ICantBelieveItsBeautifulSoup is better than BeautifulSoup
    # for html that wasn't generated by humans (like tinyMCE)
    soup = ICantBelieveItsBeautifulSoup(
        text,
        convertEntities=ICantBelieveItsBeautifulSoup.ALL_ENTITIES)

    # Remove all HTML comments from the tree. The lambda argument is
    # named ``node`` so it does not shadow the ``text`` parameter.
    comments = soup.findAll(text=lambda node: isinstance(node, Comment))
    for comment in comments:
        comment.extract()

    # NOTE: a ``replace_courier(soup)`` pass was already disabled
    # (commented out) at this point in the original code.
    replace_latex(soup)

    rst = Soup2Rst(images_dir).visit(soup)

    # Collapse any run of three or more newlines into exactly two,
    # i.e. never leave more than one consecutive blank line.
    rst = re.sub(r'\n{3,}', '\n\n', rst)

    return replace_xml_entities(rst)