def convert_blog_parts(self, str_line):
    """Convert blog parts to reST.

    The line is passed through the ``convert`` helpers in a fixed order.
    In between, every image wrapped in an anchor is handed to
    ``utils.retrieve_image`` and rewritten as an ``image`` directive whose
    ``:target:`` is the anchor's href.

    Argument:

        str_line: text string of blog entry.

    Return:

        the converted text string.
    """
    # remove hatena internal link
    str_line = convert.remove_internal_link(str_line)

    # for ditto
    str_line = convert.ditto(str_line)

    # remove span element
    str_line = convert.remove_span(str_line)

    # remove del element
    str_line = convert.remove_del(str_line)

    # for google maps
    str_line = convert.google_maps(str_line)

    # for gmodules
    str_line = convert.gmodules(str_line)

    # for img element
    str_line = convert.img2image(str_line)

    # for amazlet: the linked-image rewrite below is skipped when the line
    # mentions amazlet.
    pat_amazon, m = utils.regex_search('amazlet', str_line)
    if not m:
        pat_image, m = utils.regex_search(
            '(<a href="(.+?)" .+?><img src="(.+?)".*?/?></.+?>)', str_line)
        if m:
            def image_directive(match):
                """Build the image directive for one linked image."""
                img_path = utils.retrieve_image(match.group(3),
                                                self.dstdir + __imgdir__,
                                                self.retrieve_image_flag)
                return ('\n.. image:: ' + __imgdir__ + img_path +
                        '\n :target: ' + match.group(2) + '\n\n')

            # A callable replacement is used on purpose: a replacement
            # string built from the first match would be applied to every
            # linked image on the line, and backslashes in the URL would
            # be interpreted as regex template escapes.
            str_line = pat_image.sub(image_directive, str_line)

    # for youtube
    str_line = convert.youtube(str_line)

    # for tweet
    str_line = convert.tweet(str_line)

    # for blogparts
    str_line = convert.extract_blog_parts(str_line)

    return str_line
def fotolife2rest(self, str_line):
    """Convert hatena fotolife notation to a reST image directive.

    Argument:

        str_line: text string of blog entry.

    convert is

        from: hatena; [f:id:imageid:image]
        to:   reST  ; .. image:: imgsrc

    Return:

        the converted string, or str_line unchanged when it holds no
        fotolife notation.
    """
    pattern, found = utils.regex_search(
        '\[f:id:(.*):([0-9]*)[a-z]:image(|:.+?)\]', str_line)
    if not found:
        return str_line

    id_part = found.group(1)
    digits = found.group(2)

    # URI layout: <first char of id>/<id>/<first 8 digits>/<all digits>
    img_uri_partial = ('http://cdn-ak.f.st-hatena.com/images/fotolife/' +
                       '/'.join((id_part[0], id_part, digits[0:8], digits)))

    # get image file
    img_src = utils.retrieve_image(img_uri_partial,
                                   self.dstdir + __imgdir__,
                                   self.retrieve_image_flag)

    return pattern.sub('\n.. image:: ' + __imgdir__ + img_src, str_line)
def listing2rest(self, str_line):
    """Convert hatena list syntax to a reST list item.

    Leading ``-`` markers become bullet items (``* ``) and leading ``+``
    markers become enumerated items (``#. ``). ``self.list_lv`` holds the
    level of the last converted item; a blank line is put in front of
    the item whenever the level changes.

    Argument:

        str_line: text string of blog entry.
    """
    # The list level is the indent depth. Levels are tried in the order
    # 3, 2, 1 because a short marker would also match the start of a
    # longer one.
    #   3 is --- or +++
    #   2 is -- or ++
    #   1 is - or +
    for depth in (3, 2, 1):
        pattern, found = utils.regex_search(
            '(^(-{%d}))|(^(\+{%d}))' % (depth, depth), str_line)
        if not found:
            continue

        # group(1) is only set for the '-' alternative
        marker = '* ' if found.group(1) else '#. '
        item = ' ' * (depth - 1) + marker

        if self.list_lv != depth:
            item = '\n' + item
            self.list_lv = depth

        str_line = pattern.sub(item, str_line) + '\n'
    return str_line
def extract_tables(string_line, table, tables):
    """Collect the rows of hatena tables, one line at a time.

    A line of the form ``|a|b|`` is appended to ``table`` as a tuple
    ``(whole line, list of cells)``. The first line that is not a row
    after one or more rows closes the table: ``table`` is appended to
    ``tables`` and a fresh list is started.

    ``self`` is not a parameter here; ``self.table_flag`` is resolved
    from the surrounding scope (not visible in this block).

    Argument:

        string_line: parsing target string.
        table: rows of the table currently being collected.
        tables: list of the tables completed so far.

    Return:

        tuple of (table, tables).
    """
    pat_table, row = utils.regex_search('^\|(.+?)\|$', string_line)

    if not row:
        if self.table_flag:
            # table close
            tables.append(table)
            table = []
            self.table_flag = False
        return table, tables

    cells = row.groups()[0].split('|')
    entry = (row.group(0), cells)
    if not self.table_flag:
        # table start
        self.table_flag = True
    table.append(entry)
    return table, tables
def tweet(string):
    """Convert blog parts of twitter to reST hyperlink.

    A tweet blog part is recognised as markup sitting between two HTML
    comments. When found, the whole part is replaced by the tweet URI
    followed by ``::`` and the tweet text, or by an empty string when
    ``parse_blog_parts`` yields no URI.

    Argument:

        string: blog entry body string.

    Return:

        the converted string, or the string unchanged when it holds no
        such blog part.
    """
    # NOTE(review): the nesting of the blocks below was reconstructed from
    # a copy of this file whose line breaks and indentation were lost;
    # confirm it against version control.
    pat_comment, m = utils.regex_search(
        '((<!-- (.+?) -->) (<.+?>(.+?)</.+?> )(<!-- (.+) -->))', string)
    if m:
        # drop the opening and closing comment markers
        str_tmp = string.replace(m.group(2), '')
        str_tmp = str_tmp.replace(m.group(6), '')

        # drop the inline style element, if any
        pat_style, m2 = utils.regex_search(
            ' <style .+?>(.+?)</style> ', str_tmp)
        if m2:
            str_tmp = str_tmp.replace(m2.group(0), '')

        # put each element on its own line
        str_tmp = str_tmp.replace('><', '>\n<')
        str_tmp = str_tmp.replace('> <', '>\n<')
        str_tmp = str_tmp.replace('</span>\n', '')

        pat_tweet = re.compile(
            '((<.+?>(.+?)</.+?>)(.+?)(<.+?>(.+?)</.+?>))')
        m3 = pat_tweet.search(str_tmp)
        if m3:
            # tweet text is groups 3-5 with the anchor tags stripped
            pat_anchor = re.compile('<a.+?>')
            tweet_msg = (pat_anchor.sub('', m3.group(3)) +
                         pat_anchor.sub('', m3.group(4)) +
                         pat_anchor.sub('', m3.group(5))
                         ).replace('</a>', '')

        # NOTE(review): with this nesting tweet_msg is unbound when m3 is
        # None but a URI is found; parse_blog_parts is also called twice
        # on the same input.
        if parse_blog_parts(str_tmp.encode('utf-8')):
            uri = parse_blog_parts(str_tmp.encode('utf-8'))
            repl_str = '\n' + uri + '::\n\n ' + tweet_msg + '\n\n'
        else:
            repl_str = ''
        string = pat_comment.sub(repl_str, string)
    return string
def img2image(string):
    """Convert a leading html img element to a reST image directive.

    Argument:

        string: blog entry body string.

    Return:

        the string with the img element replaced by ``.. image:: <src>``,
        or the string unchanged when it does not start with an img
        element.
    """
    pattern, found = utils.regex_search('^<img src="(.+?)" .+?(/?)>', string)
    if not found:
        return string

    directive = '\n.. image:: ' + found.group(1) + '\n\n'
    return pattern.sub(directive, string)
def remove_del(string):
    """Remove del elements together with their content.

    Argument:

        string: blog entry body string.

    Return:

        the string without any ``<del>...</del>`` element.
    """
    pattern, found = utils.regex_search(
        '(<del( .+?|)>(.+?)</del>)', string)
    if not found:
        return string
    return pattern.sub('', string)
def remove_span(string):
    """Remove span elements, keeping the text they wrap.

    Argument:

        string: blog entry body string.

    Return:

        the string with every ``<span ...>text</span>`` replaced by its
        own ``text``.
    """
    pat_span, m = utils.regex_search(
        '(<span .+?>(.+?)</span>)', string)
    if m:
        # A callable replacement is used on purpose: substituting with
        # m.group(2) would put the FIRST span's text in place of every
        # span on the line, and backslashes in that text would be
        # interpreted as regex template escapes.
        string = pat_span.sub(lambda span: span.group(2), string)
    return string
def replace_shell_variable(string):
    """Wrap shell variable expressions in the reST ``:command:`` role.

    Argument:

        string: text string of blog entry.

    Return:

        the string with every ``${NAME}suffix`` expression wrapped as
        `` :command:`${NAME}suffix` ``.
    """
    pat_shell_var, match_obj = utils.regex_search(
        '(\${.+?}[a-zA-Z0-9/_\\\*]+)', string)
    if match_obj:
        # A callable replacement is used on purpose: the pattern accepts
        # backslashes, which a replacement *string* would interpret as
        # template escapes, and a string built from the first match would
        # overwrite every other variable on the line with the first one.
        string = pat_shell_var.sub(
            lambda var: ' :command:`' + var.group() + '` ', string)
    return string
def remove_internal_link(string):
    """Remove the brackets of hatena internal links.

    Only the ``[[`` and ``]]`` markers are dropped; the text between
    them is kept.

    Argument:

        string: blog entry body string.
    """
    pattern, found = utils.regex_search('(\[\[|\]\])', string)
    if not found:
        return string
    return pattern.sub('', string)
def parse_end_ref(string_line):
    """Parse the ending of a reference block.

    A line starting with ``<<`` closes the block: the marker is replaced
    by two newlines and ``self.ref_flag`` is cleared. Any other line is
    still inside the block and gets a leading space.

    ``self`` is not a parameter here; it is resolved from the surrounding
    scope (not visible in this block).

    Argument:

        string_line: convert target string.
    """
    closing_pattern, closing = utils.regex_search('^<<', string_line)
    if not closing:
        # still inside the reference block: indent the line
        return re.sub('^', ' ', string_line)

    converted = closing_pattern.sub('\n\n', string_line)
    self.ref_flag = False
    return converted
def parse_end_codeblock(string_line):
    """Parse the ending of a code block.

    A line starting with ``||<``, or consisting of ``|<`` only, closes
    the block: the marker is replaced by a newline and ``self.code_flag``
    is cleared. Any other line is still code and gets a leading space.

    ``self`` is not a parameter here; it is resolved from the surrounding
    scope (not visible in this block).

    Argument:

        string_line: parsing target string.
    """
    closing_pattern, closing = utils.regex_search(
        '^\|\|<|^\|<$', string_line)
    if not closing:
        # still inside the code block: indent the line
        return re.sub('^', ' ', string_line)

    converted = closing_pattern.sub('\n', string_line)
    # code block closing
    self.code_flag = False
    return converted
def gmodules(string):
    """Convert blog parts of gmodules to reST raw directive.

    Lines mentioning gmodules.com or gist.github.com that start with a
    script element get that element wrapped in a ``raw:: html``
    directive.

    Argument:

        string: blog entry body string.

    Return:

        the converted string, or the string unchanged.
    """
    # Containment is tested with ``in``; the former ``find(...) > 0``
    # treated a hit at offset 0 as "not found".
    if ('http://gmodules.com' in string or
            'https://gist.github.com' in string):
        pat_gmodules, m = utils.regex_search(
            '^<script .+?></script>', string)
        if m:
            string = pat_gmodules.sub(
                '\n.. raw:: html\n\n ' + m.group(0) + '\n', string)
    return string
def google_maps(string):
    """Convert blog parts of google maps to reST raw directive.

    Lines mentioning maps.google.com or maps.google.co.jp that hold an
    embedded map (an iframe followed by ``<br />`` and its trailing
    markup) get that markup wrapped in a ``raw:: html`` directive.

    Argument:

        string: blog entry body string.

    Return:

        the converted string, or the string unchanged.
    """
    # Containment is tested with ``in``; the former ``find(...) > 0``
    # treated a hit at offset 0 as "not found".
    if ('http://maps.google.com/' in string or
            'http://maps.google.co.jp/' in string):
        pat_google_maps, m = utils.regex_search(
            '(<iframe .+?></iframe><br />(<.+?>.+?</.+?>)(.*?)</.+?>)',
            string)
        if m:
            string = pat_google_maps.sub(
                '\n.. raw:: html\n\n ' + m.group(0) + '\n', string)
    return string
def youtube(string):
    """Convert blog parts of YouTube to reST raw directive.

    When the string mentions www.youtube.com and holds an object
    element, newlines and the ``&hl=ja`` / ``&fs=1`` parameters are
    stripped and the whole string is wrapped in a ``raw:: html``
    directive.

    Argument:

        string: blog entry body string.

    Return:

        the converted string, or the string unchanged.
    """
    # Containment is tested with ``in``; the former ``find(...) > 0``
    # treated a hit at offset 0 as "not found".
    if 'http://www.youtube.com' in string:
        pat_youtube, m = utils.regex_search(
            '(<object .+?>(.*?)</.+?>)', string)
        if m:
            string = pat_youtube.sub(m.group(0), string)
            # flatten the embed onto one line and drop player parameters
            string = string.replace('\n', '')
            string = string.replace('&hl=ja', '')
            string = string.replace('&fs=1', '')
            string = '\n.. raw:: html\n\n ' + string + '\n'
    return string
def section2rest(string):
    """Convert hatena section syntax to a reST section title.

    ``**title`` becomes a title underlined with ``-`` and ``***title``
    a title underlined with ``^``. The underline length is taken from
    ``utils.length_str``.

    Argument:

        string: text string of blog entry.
    """
    # (number of leading asterisks, underline character);
    # 3: subsection is tried before 2: section.
    for stars, underline in ((3, '^'), (2, '-')):
        pattern, found = utils.regex_search(
            '^(\*){%d}(.*)' % stars, string)
        if not found:
            continue

        # strip the blanks between the asterisks and the title text
        leading_blank = re.compile('^\s+')
        title = leading_blank.sub('', found.group(2))
        rule = underline * utils.length_str(title)
        string = pattern.sub('\n' + title + '\n' + rule + '\n', string)
    return string
def parse_begin_ref(string_line):
    """Parse the beginning of a reference block.

    A line of the form ``>>`` or ``>http(s)://...>`` opens the block:
    ``self.ref_flag`` is set and the line is reduced to the URI, or to
    an empty string when no URI is given.

    ``self`` is not a parameter here; it is resolved from the surrounding
    scope (not visible in this block).

    Argument:

        string_line: convert target string.
    """
    opening_pattern, opening = utils.regex_search(
        '^>((http|https)://(.+?)|)>$', string_line)
    if not opening:
        return string_line

    self.ref_flag = True
    # group(1) is the URI, or '' for a bare '>>'
    uri = opening.group(1) or ''
    return opening_pattern.sub(uri, string_line)
def parse_begin_codeblock(string_line):
    """Parse the beginning of a code block.

    A line ending in ``>|lexer|`` or ``>|`` opens the block:
    ``self.code_flag`` is set and the marker is replaced by a
    ``code-block`` directive. The lexer name goes through
    ``convert.replace_lexer``; ``sh`` is used when no name is given.

    ``self`` is not a parameter here; it is resolved from the surrounding
    scope (not visible in this block).

    Argument:

        string_line: convert target string.
    """
    opening_pattern, opening = utils.regex_search(
        '>\|([a-zA-Z0-9]*)\|$|>\|()$', string_line)
    if not opening:
        return string_line

    # code block opening
    self.code_flag = True

    lexer_name = opening.group(1)
    if lexer_name:
        lexer_str = convert.replace_lexer(lexer_name)
    else:
        lexer_str = 'sh'
    return opening_pattern.sub(
        '\n.. code-block:: ' + lexer_str + '\n', string_line)
def ditto(string):
    """Convert blog parts of twitter with ditto to reST hyperlink.

    A ditto blog part is recognised as a style element directly followed
    by a div element. The div is parsed as XML to pull out the tweet URI
    and the tweet text, which are combined as ``<uri>::`` followed by
    the text.

    Argument:

        string: blog entry body string.

    Return:

        the converted string, or the string unchanged when it holds no
        such blog part.
    """
    # NOTE(review): the nesting of the blocks below was reconstructed from
    # a copy of this file whose line breaks and indentation were lost;
    # confirm it against version control.
    pat_ditto, m = utils.regex_search(
        '(<style .+?>.+?</style>)(<div .+?>.+?</div>)', string)
    if m:
        # NOTE(review): as written this substitutes '&' for '&', i.e. it
        # changes nothing; presumably meant to escape bare ampersands
        # before XML parsing - confirm.
        ex_ref_char = re.compile('\&(?!amp;)', flags=re.U)
        # from here on `string` is the div element only
        string = ex_ref_char.sub('&', m.group(2))

        # get uri
        uri = ''
        xmltree = xml.etree.ElementTree.fromstring(string.encode('utf-8'))
        for p_child in xmltree.find('p').getchildren():
            for i, p_child_child in enumerate(p_child.getchildren()):
                # the second grandchild carrying an href holds the link
                if i == 1 and p_child_child.get('href'):
                    uri = p_child_child.get('href')

        # get tweet message
        tweet_msg = ''
        if xmltree.get('class').find('ditto') == 0:
            # NOTE(review): span_element is never used afterwards
            span_element = xmltree.find('p').find('span').find('span')
            for i, v in enumerate(xmltree.itertext()):
                # the first two text nodes are skipped
                if i > 1:
                    pat = re.compile(' |via', flags=re.U)
                    # NOTE(review): comparing a match object with 0 only
                    # works under Python 2 ordering rules
                    if pat.search(v) > 0:
                        break
                    else:
                        tweet_msg += str(v.encode('utf-8'))

        repl_str = '\n' + uri + '::\n\n ' + tweet_msg + '\n\n'
        # NOTE(review): the arguments are in (repl, string) order, so the
        # pattern is applied to repl_str rather than to the blog entry;
        # confirm whether sub(repl_str, <original string>) was intended.
        string = pat_ditto.sub(m.group(), repl_str).decode('utf-8')
    return string