"-de", "-en", "-es", "-fr", "-it", "-nl", "-shell", "stop-words", "mbsp-tags", "-dev"): # We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation. if p.startswith("-"): p = "pattern" + p.rstrip("-") title = p.replace("-", ".") if p == "stop-words": title = "Stop words" if p == "mbsp-tags": title = "Penn Treebank II tag set" # Download the online documentation pages. print "Retrieving", url + p html = URL(url + p).download(cached=False) # Parse the actual documentation, we don't need the website header, footer, navigation, search. html = Document(html) html = html.by_id("content-area") html = html.by_class("node-type-page")[0] html = html.source html = strip_javascript(html) html = strip_between('<div id="navbar">', '/#navbar -->', html) html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html) html = strip_between('<div id="footer">', '/#footer -->', html) html = strip_between('<a class="twitter-share-button"', '</a>', html) # Link to local pages and images. # Link to online media. html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url) # MBSP docs (online) html = re.sub('href="/pages/(pattern-examples.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(using-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(modeling-.*?)"', 'href="%s\\1"' % url, html) # examples (online) html = re.sub('href="/pages/(.*?)([#|"])', 'href="\\1.html\\2', html) # pages (offline) html = html.replace('src="/media/', 'src="../g/') # images (offline)