Example #1
# Imports assumed by this snippet: feedparser and html2text (the html2text
# package's html2text() function), plus stdlib os and re.  META is assumed to
# be a module-level reST metadata template defined elsewhere in the original
# source.
import os
import re

import feedparser
from html2text import html2text


def get_blogger_posts(blogger_id):
    max_results = 25
    index = 1
    count = 0
    link = 'http://www.blogger.com/feeds/%s/posts/default?start-index=%%s&max-results=%s' % (
        blogger_id, max_results)

    while True:
        url = link % index
        print url
        # get posts by link
        #        <link rel="service.post" type="application/atom+xml"
        #            title="hoamon&#39;s sandbox - Atom"
        #            href="http://www.blogger.com/feeds/398420085248706856/posts/default" />
        d = feedparser.parse(url)
        for e in d.entries:
            count += 1
            s = html2text(e.content[0].value)
            filepath = re.sub('^http://blog.hoamon.info/', '', e.link)
            filename = re.sub('.html$', '.rst', os.path.basename(filepath))
            dirname = os.path.join('blogger', os.path.dirname(filepath), '01')
            if not os.path.isdir(dirname):
                os.makedirs(dirname)

            file = open(os.path.join(dirname, filename), 'wb')
            title = e.title + '\n' + '=' * 80 + '\n\n'
            file.write(title)
            file.write(s)

            post_id = re.sub(r'^.*-([0-9]+)$', r'\1', e.id)
            # get comments of one post
            cd = feedparser.parse(
                'http://blog.hoamon.info/feeds/%s/comments/default' % post_id)
            cdes = cd.entries[:]
            if len(cdes) > 0:
                file.write('\n\nOld Comments in Blogger\n' + '-' * 80 + '\n\n')
                cdes.reverse()
                for ce in cdes:
                    name = ce.author_detail.get('name', 'No Name')
                    href = ce.author_detail.get('href', 'No Href')
                    author = '\n\n`%s <%s>`_ at %s:\n' % (name, href,
                                                          ce.updated)
                    file.write(author + '^' * (len(author) + 10) + '\n\n')
                    cs = html2text(ce.content[0].value)
                    file.write(cs)

            if hasattr(e, 'tags'):
                meta = META % {'tags': ', '.join([i['term'] for i in e.tags])}
            else:
                meta = META % {'tags': ''}
            file.write(meta)
            file.close()
        if len(d.entries) == 0: break
        index += max_results
    return count
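A minimal usage sketch for the function above, assuming feedparser and
html2text are installed; the blog ID below is only illustrative (it is taken
from the service.post link quoted in the comments).

if __name__ == '__main__':
    # illustrative only: pass the numeric Blogger blog ID
    total = get_blogger_posts('398420085248706856')
    print 'converted %d posts' % total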
Example #2
# Assumed context: import subprocess; ReSTed (config holder) and html2rst are
# defined elsewhere in the original source.
def getRST(html):
    if ReSTed.config["usePandoc"] == True:
        p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                             stdin=subprocess.PIPE, stdout=subprocess.PIPE)
        return p.communicate(html)[0]
    else:
        return html2rst.html2text(html)
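The ReSTed.config and html2rst names above belong to the surrounding project;
a self-contained sketch of the same pandoc pipe, assuming only that the pandoc
binary is on PATH, could look like this:

import subprocess

def html_to_rst_via_pandoc(html):
    # pipe HTML through pandoc and capture the generated reST
    p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    return p.communicate(html)[0]

print html_to_rst_via_pandoc('<h1>Hello</h1>')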
Example #3
# Assumed imports for this snippet: subprocess plus urllib2 and ElementTree
# from the standard library; ReSTed, html2rst, html2text, save_data_file and
# the HTML_SAVE_DIR / REST_SAVE_DIR / URL_LIST_API constants are defined
# elsewhere in the original source.
import subprocess
import urllib2
from xml.etree import ElementTree


def getRST(html):
    if ReSTed.config["usePandoc"] == True:
        p = subprocess.Popen(['pandoc', '--from=html', '--to=rst'],
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE)
        return p.communicate(html)[0]
    else:
        return html2rst.html2text(html)
def get_page_content(path, name, title, apiurl):
    try:
        s = urllib2.urlopen(apiurl)
        xml = s.read()
        # print xml
    except Exception, e:
        print "Error: root api ", e
    else:
        s.close()
        dom = ElementTree.fromstring(xml)
        # import pdb; pdb.set_trace()  # debugging breakpoint left in the original
        # dom = _link_path_localize(dom)
        body = dom.findtext('.//body')
        html = "<html><head><title>%s</title></head><body>%s</body></html>" % (title, body,)
        rst = html2text(html)
        save_data_file(HTML_SAVE_DIR, path, name + '.html', html)
        save_data_file(REST_SAVE_DIR, path, name + '.rst', rst)


def get_pages(root='/'):
    try:
        s = urllib2.urlopen(URL_LIST_API)
        try:
            xml = s.read()
        except Exception, e:
            print "Error: root api ", e
        finally:
            s.close()
    except Exception, e:
        print "Error: root api", e
Example #5
#!/usr/bin/env python2
# equivalent to
# markdown README.md -e utf-8 -x tables -x wikilinks -x fenced_code -x toc -x def_list | ./html2rst.py > README.rst
import sys
import markdown
import html2rst

md = markdown.Markdown(
    extensions=['tables', 'fenced_code', 'toc', 'def_list',
                'codehilite(force_linenos=True)'],
    extension_configs={
        # old-style list-of-tuples config; redundant with the inline
        # codehilite(force_linenos=True) option above
        "codehilite": [("force_linenos", True)],
    },
)
html = md.convert(open(sys.argv[1] if len(sys.argv) > 1 else 'README.md').read())
open('/tmp/x', 'w').write(html)  # debug dump of the intermediate HTML
print html2rst.html2text(html).decode('utf-8')
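To write the result to a file instead of relying on shell redirection (as in
the pipeline quoted in the header comment), a small variation could be used,
assuming html2rst.html2text returns UTF-8 encoded bytes as the final print
line implies:

rst = html2rst.html2text(html).decode('utf-8')
with open('README.rst', 'w') as f:
    f.write(rst.encode('utf-8'))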