def get_blogger_posts(blogger_id):
    """Export every post (and its comments) of a Blogger blog to reST files."""
    max_results = 25
    index = 1
    count = 0
    # Blogger paginates its Atom feed; request max_results posts per page.
    link = 'http://www.blogger.com/feeds/%s/posts/default?start-index=%%s&max-results=%s' % (
        blogger_id, max_results)
    while 1:
        l = link % index
        print l
        # get posts by link
        # <link rel="service.post" type="application/atom+xml"
        #       title="hoamon's sandbox - Atom"
        #       href="http://www.blogger.com/feeds/398420085248706856/posts/default" />
        d = feedparser.parse(l)
        for e in d.entries:
            count += 1
            s = html2text(e.content[0].value)
            # Mirror the post's URL path under a local 'blogger/' directory.
            filepath = re.sub('^http://blog.hoamon.info/', '', e.link)
            filename = re.sub('.html$', '.rst', os.path.basename(filepath))
            dirname = os.path.join('blogger', os.path.dirname(filepath), '01')
            if not os.path.isdir(dirname):
                os.makedirs(dirname)
            file = open(os.path.join(dirname, filename), 'wb')
            title = e.title + '\n' + '=' * 80 + '\n\n'
            file.write(title)
            file.write(s)
            id = re.sub('^.*\-([0-9]+)$', '\\1', e.id)
            # get comment of one post
            cd = feedparser.parse(
                'http://blog.hoamon.info/feeds/%s/comments/default' % id)
            cdes = cd.entries[:]
            if len(cdes) > 0:
                file.write('\n\nOld Comments in Blogger\n' + '-' * 80 + '\n\n')
                cdes.reverse()
                for ce in cdes:
                    name = ce.author_detail.get('name', 'No Name')
                    href = ce.author_detail.get('href', 'No Href')
                    author = '\n\n`%s <%s>`_ at %s:\n' % (name, href, ce.updated)
                    file.write(author + '^' * (len(author) + 10) + '\n\n')
                    cs = html2text(ce.content[0].value)
                    file.write(cs)
            if hasattr(e, 'tags'):
                meta = META % {'tags': ', '.join([i['term'] for i in e.tags])}
            else:
                meta = META % {'tags': ''}
            file.write(meta)
            file.close()
        if len(d.entries) == 0:
            break
        index += max_results
    return count
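# Usage sketch for get_blogger_posts (assumptions: feedparser, html2text, os,
# re and a META footer template are defined at module level as in the script
# above; the numeric feed ID is the one that appears in the comment inside the
# function and stands in for your own blog's ID).
if __name__ == '__main__':
    exported = get_blogger_posts('398420085248706856')
    print 'exported %d posts' % exported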
def getRST(html):
    if ReSTed.config["usePandoc"] == True:
        p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        return p.communicate(html)[0]
    else:
        return html2rst.html2text(html)
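# Usage sketch for getRST (assumptions: the surrounding module imports
# subprocess and html2rst and provides the ReSTed.config dict; pandoc must be
# on PATH when "usePandoc" is enabled; the HTML string is just an example).
ReSTed.config["usePandoc"] = False      # pure-Python html2rst fallback
print getRST("<h1>Title</h1><p>Some <em>HTML</em> text.</p>")

ReSTed.config["usePandoc"] = True       # shell out to pandoc instead
print getRST("<h1>Title</h1><p>Some <em>HTML</em> text.</p>")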
def get_page_content(path, name, title, apiurl):
    try:
        s = urllib2.urlopen(apiurl)
        xml = s.read()
        # print xml
    except Exception, e:
        print "Error: page api ", e
    else:
        s.close()
        dom = ElementTree.fromstring(xml)
        # import pdb; pdb.set_trace()
        # dom = _link_path_localize(dom)
        body = dom.findtext('.//body')
        html = "<html><head><title>%s</title></head><body>%s</body></html>" % (title, body,)
        rst = html2text(html)
        save_data_file(HTML_SAVE_DIR, path, name + '.html', html)
        save_data_file(REST_SAVE_DIR, path, name + '.rst', rst)


def get_pages(root='/'):
    try:
        s = urllib2.urlopen(URL_LIST_API)
        try:
            xml = s.read()
        except Exception, e:
            print "Error: root api ", e
        finally:
            s.close()
    except Exception, e:
        print "Error: root api", e
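# Usage sketch for the two fetchers above (assumptions: urllib2, ElementTree,
# html2text, save_data_file, HTML_SAVE_DIR, REST_SAVE_DIR and URL_LIST_API are
# defined at module level as in the original script; the API URL below is a
# hypothetical placeholder, not a real endpoint).
if __name__ == '__main__':
    get_pages('/')
    get_page_content('docs/intro', 'intro', 'Introduction',
                     'http://wiki.example.com/api/page/docs/intro')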
#!/usr/bin/env python2
# equivalent to
#   markdown README.md -e utf-8 -x tables -x wikilinks -x fenced_code -x toc -x def_list | ./html2rst.py > README.rst
import sys

import markdown

import html2rst

md = markdown.Markdown(
    extensions=['tables', 'fenced_code', 'toc', 'def_list',
                'codehilite(force_linenos=True)'],
    extension_configs={
        "codehilite": [("force_linenos", True)],
    },
)

# Convert the Markdown source (README.md by default) to HTML, keep a copy of
# the intermediate HTML for inspection, then emit reStructuredText on stdout.
html = md.convert(open(sys.argv[1] if len(sys.argv) > 1 else 'README.md').read())
open('/tmp/x', 'w').write(html)
print html2rst.html2text(html).decode('utf-8')