Python regex_search 예제들, utils.regex_search Python 예제들

예제 #1

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

    def convert_blog_parts(self, str_line):
        """Run every blog-part converter over one line of an entry.

        Argument:

            str_line: text string of blog entry.

        Returns the line after all converters have been applied in a
        fixed order.
        """

        # Converters applied before the linked-image handling, in order:
        # hatena internal link, ditto, span, del, google maps, gmodules,
        # img element.
        leading_steps = (
            convert.remove_internal_link,
            convert.ditto,
            convert.remove_span,
            convert.remove_del,
            convert.google_maps,
            convert.gmodules,
            convert.img2image,
        )
        for step in leading_steps:
            str_line = step(str_line)

        # A linked image is rewritten as an image directive only when the
        # line does not mention amazlet.
        _, amazlet_found = utils.regex_search('amazlet', str_line)
        if not amazlet_found:
            link_pattern, link_found = utils.regex_search(
                '(<a href="(.+?)" .+?><img src="(.+?)".*?/?></.+?>)', str_line)
            if link_found:
                # Group 3 is the img src, group 2 the href of the anchor.
                fetched = utils.retrieve_image(link_found.group(3),
                                               self.dstdir + __imgdir__,
                                               self.retrieve_image_flag)
                directive = ('\n.. image:: ' + __imgdir__ + fetched
                             + '\n   :target: ' + link_found.group(2)
                             + '\n\n')
                str_line = link_pattern.sub(directive, str_line)

        # Converters applied afterwards: youtube, tweet, generic blogparts.
        for step in (convert.youtube,
                     convert.tweet,
                     convert.extract_blog_parts):
            str_line = step(str_line)

        return str_line

예제 #2

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

    def fotolife2rest(self, str_line):
        """Rewrite hatena fotolife notation as a reST image directive.

        Argument:

            str_line: text string of blog entry.

        Conversion:
            from: hatena; [f:id:imageid:image]
            to:   reST  ; .. image:: imgsrc
                              :target: uri

        The line is returned unchanged when it holds no fotolife notation.
        """
        pattern, found = utils.regex_search(
            '\[f:id:(.*):([0-9]*)[a-z]:image(|:.+?)\]', str_line)
        if not found:
            return str_line

        id_part = found.group(1)
        num_part = found.group(2)
        # Remote path layout: <first char of id>/<id>/<first 8 digits>/<digits>
        remote_uri = ('http://cdn-ak.f.st-hatena.com/images/fotolife/'
                      + '/'.join((id_part[0], id_part,
                                  num_part[0:8], num_part)))
        # get image file
        local_src = utils.retrieve_image(remote_uri,
                                         self.dstdir + __imgdir__,
                                         self.retrieve_image_flag)
        return pattern.sub('\n.. image:: ' + __imgdir__ + local_src, str_line)

예제 #3

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

    def listing2rest(self, str_line):
        """Turn hatena list markers at the start of a line into reST items.

        Argument:

            str_line: text string of blog entry.

        Returns the converted line with a trailing newline appended.
        self.list_lv is updated to the depth of the marker that matched.
        """

        # The list level is the indent depth. Levels are tried in the
        # order 3, 2, 1 because a short marker would otherwise match the
        # start of a longer one:
        #   3 is --- or +++
        #   2 is -- or ++
        #   1 is - or +
        for depth in (3, 2, 1):
            pattern, found = utils.regex_search(
                '(^(-{%d}))|(^(\+{%d}))' % (depth, depth), str_line)
            if not found:
                continue
            indent = '  ' * (depth - 1)
            # Group 1 is set for '-' markers (bullet); otherwise the '+'
            # alternative matched (enumerated item).
            if found.group(1):
                marker = indent + '* '
            else:
                marker = indent + '#. '
            if self.list_lv != depth:
                # Depth changed: start the item on a fresh line.
                marker = '\n' + marker
                self.list_lv = depth
            str_line = pattern.sub(marker, str_line)
        return str_line + '\n'

예제 #4

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

        def extract_tables(string_line, table, tables):
            """Collect table rows line by line.

            Argument:

                string_line: parsing target string.
                table:       rows gathered so far for the current table
                tables:      list of completed tables

            Returns the (table, tables) pair. A line of the form
            ``|...|`` is appended to ``table`` as a tuple of the whole
            line and its '|'-separated cells. Any other line seen while a
            table is open appends ``table`` to ``tables`` and hands back a
            fresh empty list as the new ``table``.
            """
            _, row_found = utils.regex_search('^\|(.+?)\|$', string_line)
            if row_found:
                row = (row_found.group(0),
                       row_found.groups()[0].split('|'))
                if not self.table_flag:
                    # table start
                    self.table_flag = True
                table.append(row)
                return table, tables

            if self.table_flag:
                # table close
                tables.append(table)
                table = []
                self.table_flag = False
            return table, tables

예제 #5

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def tweet(string):
    """Convert blog parts of twitter to reST hyperlink.

    Argument:

        string: blog entry body string.

    Returns the input with the comment-delimited block replaced by a reST
    fragment built from the value parse_blog_parts() returns and the
    extracted tweet text, or by an empty string when parse_blog_parts()
    returns a false value. The input is returned unchanged when the block
    or its <style> element is not found.
    """

    pat_comment, m = utils.regex_search(
        '((<!-- (.+?) -->) (<.+?>(.+?)</.+?> )(<!-- (.+) -->))', string)
    if not m:
        return string

    # Strip the leading and trailing HTML comments around the block.
    str_tmp = string.replace(m.group(2), '')
    str_tmp = str_tmp.replace(m.group(6), '')
    _, m2 = utils.regex_search(
        ' <style .+?>(.+?)</style> ', str_tmp)
    if not m2:
        return string

    # Drop the inline stylesheet, put adjacent elements on separate lines
    # and remove closing span tags that end a line.
    str_tmp = str_tmp.replace(m2.group(0), '')
    str_tmp = str_tmp.replace('><', '>\n<')
    str_tmp = str_tmp.replace('> <', '>\n<')
    str_tmp = str_tmp.replace('</span>\n', '')

    # Default to an empty message so the replacement below is well defined
    # even when the text pattern does not match (previously this raised a
    # NameError because tweet_msg was never assigned).
    tweet_msg = ''
    pat_tweet = re.compile(
        '((<.+?>(.+?)</.+?>)(.+?)(<.+?>(.+?)</.+?>))')
    m3 = pat_tweet.search(str_tmp)
    if m3:
        # Concatenate the three text pieces with anchor tags removed.
        pat_anchor = re.compile('<a.+?>')
        tweet_msg = (pat_anchor.sub('', m3.group(3)) +
                     pat_anchor.sub('', m3.group(4))
                     + pat_anchor.sub('', m3.group(5))
                     ).replace('</a>', '')

    # Parse once and reuse the result for both the check and the output.
    uri = parse_blog_parts(str_tmp.encode('utf-8'))
    if uri:
        repl_str = '\n' + uri + '::\n\n   ' + tweet_msg + '\n\n'
    else:
        repl_str = ''
    return pat_comment.sub(repl_str, string)

예제 #6

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def img2image(string):
    """Rewrite a leading html img element as a reST image directive.

    Argument:

        string: blog entry body string.

    The input is returned unchanged when it does not start with an img
    element.
    """

    pattern, found = utils.regex_search('^<img src="(.+?)" .+?(/?)>', string)
    if not found:
        return string
    # Group 1 is the value of the src attribute.
    directive = '\n.. image:: ' + found.group(1) + '\n\n'
    return pattern.sub(directive, string)

예제 #7

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def remove_del(string):
    """Strip del elements, including the text they enclose.

    Argument:

        string: blog entry body string.

    """

    pattern, found = utils.regex_search(
        '(<del( .+?|)>(.+?)</del>)', string)
    if not found:
        return string
    return pattern.sub('', string)

예제 #8

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def remove_span(string):
    """Remove span elements, keeping the text each one encloses.

    Argument:

        string: blog entry body string.

    Returns the string with every ``<span ...>text</span>`` replaced by
    its own ``text``.
    """

    pat_span, m = utils.regex_search(
        '(<span .+?>(.+?)</span>)', string)
    if m:
        # Substitute a per-match group reference instead of the text of
        # the first match: otherwise every span in the string would be
        # replaced by the first span's content, and backslashes in that
        # content would be read as replacement-template escapes.
        string = pat_span.sub(r'\2', string)
    return string

예제 #9

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def replace_shell_variable(string):
    """Wrap shell variable expressions in a reST ``:command:`` role.

    Argument:

        string: text string of blog entry.

    Returns the string with every ``${...}suffix`` expression replaced by
    `` :command:`<expression>` `` (surrounded by single spaces).
    """

    pat_shell_var, match_obj = utils.regex_search(
        r'(\${.+?}[a-zA-Z0-9/_\\*]+)', string)
    if match_obj:
        # Build the replacement from each match separately. Passing the
        # first match's text as the replacement would rewrite every later
        # expression to the first one, and the pattern allows backslashes
        # which a replacement template would interpret as escapes.
        string = pat_shell_var.sub(
            lambda found: ' :command:`' + found.group() + '` ', string)
    return string

예제 #10

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def remove_internal_link(string):
    """Strip the ``[[`` and ``]]`` brackets of hatena internal links.

    Argument:

        string: blog entry body string.

    """

    bracket_pattern, found = utils.regex_search(
        '(\[\[|\]\])', string)
    if not found:
        return string
    return bracket_pattern.sub('', string)

예제 #11

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

        def parse_end_ref(string_line):
            """Handle one line while inside a reference block.

            Argument:

                string_line: convert target string.

            A line starting with ``<<`` has that marker replaced by two
            newlines and clears self.ref_flag; any other line is
            returned with three leading spaces added.
            """
            end_pattern, found = utils.regex_search(
                '^<<', string_line)
            if not found:
                # Still inside the block: indent the line.
                return re.sub('^', '   ', string_line)
            self.ref_flag = False
            return end_pattern.sub('\n\n', string_line)

예제 #12

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

        def parse_end_codeblock(string_line):
            """Handle one line while inside a code block.

            Argument:

                string_line: parsing target string.

            A line starting with ``||<``, or consisting of ``|<``, has
            the marker replaced by a newline and clears self.code_flag;
            any other line is returned with three leading spaces added.
            """
            close_pattern, found = utils.regex_search(
                '^\|\|<|^\|<$', string_line)
            if not found:
                # Still inside the block: indent the line.
                return re.sub('^', '   ', string_line)
            # code block closing
            self.code_flag = False
            return close_pattern.sub('\n', string_line)

예제 #13

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def gmodules(string):
    """Wrap a gmodules / gist script element in a reST raw directive.

    Argument:

        string: blog entry body string.

    Only strings that contain one of the two host URLs at an index
    greater than zero are examined; others are returned unchanged.
    """

    mentions_host = (string.find('http://gmodules.com') > 0 or
                     string.find('https://gist.github.com') > 0)
    if not mentions_host:
        return string

    script_pattern, found = utils.regex_search(
        '^<script .+?></script>', string)
    if found:
        raw_block = '\n.. raw:: html\n\n    ' + found.group(0) + '\n'
        string = script_pattern.sub(raw_block, string)
    return string

예제 #14

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def google_maps(string):
    """Wrap an embedded google maps iframe in a reST raw directive.

    Argument:

        string: blog entry body string.

    Only strings that contain one of the two maps URLs at an index
    greater than zero are examined; others are returned unchanged.
    """

    mentions_maps = (string.find('http://maps.google.com/') > 0 or
                     string.find('http://maps.google.co.jp/') > 0)
    if not mentions_maps:
        return string

    maps_pattern, found = utils.regex_search(
        '(<iframe .+?></iframe><br />(<.+?>.+?</.+?>)(.*?)</.+?>)',
        string)
    if found:
        raw_block = '\n.. raw:: html\n\n    ' + found.group(0) + '\n'
        string = maps_pattern.sub(raw_block, string)
    return string

예제 #15

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def youtube(string):
    """Wrap an embedded YouTube object element in a reST raw directive.

    Argument:

        string: blog entry body string.

    Strings that do not contain the YouTube URL at an index greater than
    zero, or that hold no object element, are returned unchanged.
    """

    if string.find('http://www.youtube.com') <= 0:
        return string

    object_pattern, found = utils.regex_search(
        '(<object .+?>(.*?)</.+?>)', string)
    if not found:
        return string

    cleaned = object_pattern.sub(found.group(0), string)
    # Remove newlines and the two query parameters from the whole string.
    for unwanted in ('\n', '&hl=ja', '&fs=1'):
        cleaned = cleaned.replace(unwanted, '')
    return '\n.. raw:: html\n\n   ' + cleaned + '\n'

예제 #16

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def section2rest(string):
    """Turn hatena ``**`` / ``***`` headings into underlined reST titles.

    Argument:

        string: text string of blog entry.
    """

    # 3 stars: subsection, underlined with '^'; 2 stars: section,
    # underlined with '-'. The longer marker is tried first.
    for stars, underline in ((3, '^'), (2, '-')):
        heading_pattern, found = utils.regex_search(
            '^(\*){%d}(.*)' % stars, string)
        if not found:
            continue
        # Title text is what follows the stars, without leading blanks.
        title = re.compile('^\s+').sub('', found.group(2))
        rule = underline * utils.length_str(title)
        string = heading_pattern.sub(
            '\n' + title + '\n' + rule + '\n', string)
    return string

예제 #17

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

        def parse_begin_ref(string_line):
            """Detect the opening line of a reference block.

            Argument:

                string_line: convert target string.

            A line of the form ``>>`` or ``>http(s)://...>`` sets
            self.ref_flag and is replaced by the URL it carries (or by
            an empty string when there is none). Other lines are
            returned unchanged.
            """
            open_pattern, found = utils.regex_search(
                '^>((http|https)://(.+?)|)>$', string_line)
            if not found:
                return string_line

            self.ref_flag = True
            # Group 1 holds the URL, or is empty for a bare '>>'.
            replacement = found.group(1) or ''
            return open_pattern.sub(replacement, string_line)

예제 #18

0

파일 보기

파일: htnparser.py 프로젝트: mkouhei/hatena2rest

        def parse_begin_codeblock(string_line):
            """Detect the opening line of a code block.

            Argument:

                string_line: convert target string.

            A line ending in ``>|name|`` or ``>|`` sets self.code_flag
            and has the marker replaced by a ``code-block`` directive.
            The directive argument is convert.replace_lexer(name) when a
            name is given and ``sh`` otherwise. Other lines are returned
            unchanged.
            """
            open_pattern, found = utils.regex_search(
                '>\|([a-zA-Z0-9]*)\|$|>\|()$', string_line)
            if not found:
                return string_line

            # code block opening
            self.code_flag = True
            lexer_name = found.group(1)
            if lexer_name:
                directive = ('\n.. code-block:: '
                             + convert.replace_lexer(lexer_name) + '\n')
            else:
                directive = '\n.. code-block:: sh\n'
            return open_pattern.sub(directive, string_line)

예제 #19

0

파일 보기

파일: convert.py 프로젝트: mkouhei/hatena2rest

def ditto(string):
    """Convert blog parts of twitter with ditto to reST hyperlink.

    Argument:

        string: blog entry body string.

    When the string contains a <style> element immediately followed by a
    <div> element, the div is parsed as XML to extract a link URI and
    the tweet text, and a reST fragment built from the two is returned.
    Otherwise the input is returned unchanged.
    """

    pat_ditto, m = utils.regex_search(
        '(<style .+?>.+?</style>)(<div .+?>.+?</div>)', string)
    if m:
        # Escape every '&' that is not already the start of '&amp;' so the
        # div can be handed to the XML parser below.
        ex_ref_char = re.compile('\&(?!amp;)', flags=re.U)
        # From here on `string` is the escaped <div> fragment only; the
        # original input is no longer referenced.
        string = ex_ref_char.sub('&amp;', m.group(2))

        # get uri
        # Walk the grandchildren of the first <p>; the href of a second
        # grandchild (index 1) is taken, the last such one winning.
        uri = ''
        xmltree = xml.etree.ElementTree.fromstring(string.encode('utf-8'))
        for p_child in xmltree.find('p').getchildren():
            for i, p_child_child in enumerate(p_child.getchildren()):
                if i == 1 and p_child_child.get('href'):
                    uri = p_child_child.get('href')

        # get tweet message
        tweet_msg = ''
        # Only divs whose class attribute starts with 'ditto' are read.
        if xmltree.get('class').find('ditto') == 0:
            # NOTE(review): span_element is assigned but not used in the
            # code visible here.
            span_element = xmltree.find('p').find('span').find('span')
            # Skip the first two text chunks, then accumulate text until a
            # chunk containing '&nbsp;' or 'via' is reached.
            for i, v in enumerate(xmltree.itertext()):
                if i > 1:
                    pat = re.compile('&nbsp;|via', flags=re.U)
                    # NOTE(review): this compares the search result (a
                    # match object or None) with an int; that relies on
                    # Python 2 ordering and would raise TypeError on
                    # Python 3 -- confirm the target interpreter.
                    if pat.search(v) > 0:
                        break
                    else:
                        tweet_msg += str(v.encode('utf-8'))
        repl_str = '\n' + uri + '::\n\n   ' + tweet_msg + '\n\n'
        # NOTE(review): a compiled pattern's sub() takes (repl, string), so
        # here repl_str is the text being searched and m.group() is the
        # replacement; the arguments look swapped -- confirm the intent.
        # The trailing decode() presumes a byte string (Python 2 str).
        string = pat_ditto.sub(m.group(), repl_str).decode('utf-8')

    return string