예제 #1
0
def educateQuotes(text, language='en'):
    # type: (str, str) -> str
    """
    Replace straight quotes in `text` with typographic ("curly") quotes.

    Parameter:  - text: the string to process.
                - language: `BCP 47` language tag; it selects the quote
                  characters via ``smartquotes.smartchars`` and enables
                  the English-only decade rule when it starts with 'en'.
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”   (with English quote characters)

    The substitutions below are order-dependent: every pass consumes
    some of the remaining straight quotes, and the final pass of each
    kind treats whatever is left as an opening quote.
    """

    smart = smartquotes.smartchars(language)
    try:
        apostrophe = smart.apostrophe
    except Exception:
        # Best effort: fall back to U+2019 when the quote set provides
        # no usable `apostrophe` attribute.
        apostrophe = u'’'

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force.
    # The assertion has to be a single-backslash \B: in a raw string a
    # doubled backslash would match a literal backslash followed by "B".
    text = re.sub(r"""^'(?=%s\B)""" % (punct_class,), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\B)""" % (punct_class,), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote + smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote + smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        # `flags` must be passed by keyword: the fourth positional
        # argument of re.sub() is `count`, so a positional re.UNICODE
        # would cap the number of replacements instead of setting a flag.
        text = re.sub(r"""'(?=\d{2}s)""", apostrophe, text, flags=re.UNICODE)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""&#8211;|&#8212;"""

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or decimal entities
                            &\#x201[34];    # or hex
                    )
                    '                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE | re.UNICODE)
    text = opening_single_quotes_regex.sub(r'\1' + smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != apostrophe:
        # A quote between two word characters is an apostrophe
        # (\w already includes the digits).
        apostrophe_regex = re.compile(r"(?<=\w)'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Closing single quotes, pass 1: the quote follows a "closing
    # context" character and is not followed by whitespace, a
    # word-final "s" or a digit.
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (?!\s  |       # whitespace
                       s\b |
                        \d         # digits   ('80s)
                    )
                    """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1' + smart.csquote, text)

    # Closing single quotes, pass 2: the quote is followed by whitespace
    # or a word-final "s"; the follower is captured and re-emitted.
    closing_single_quotes_regex = re.compile(r"""
                    (%s)
                    '
                    (\s | s\b)
                    """ % (close_class,), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or decimal entities
                            &\#x201[34];    # or hex
                    )
                    "                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes,), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1' + smart.opquote, text)

    # Double closing quotes: a quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r'"(?=\s)')
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... or directly preceded by a "closing context" character.
    closing_double_quotes_regex = re.compile(r"""
                    (%s)   # character that indicates the quote should be closing
                    "
                    """ % (close_class,), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1' + smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text
예제 #2
0
def educateQuotes(text: str, language: str = 'en') -> str:
    """
    Replace straight quotes in `text` with typographic ("curly") quotes.

    Parameter:  - text: the string to process.
                - language: `BCP 47` language tag; it selects the quote
                  characters via ``smartquotes.smartchars`` and enables
                  the English-only decade rule when it starts with 'en'.
    Returns:    The `text`, with "educated" curly quote characters.

    Example input:  "Isn't this fun?"
    Example output: “Isn’t this fun?”   (with English quote characters)

    The substitutions below are order-dependent: every pass consumes
    some of the remaining straight quotes, and the final pass of each
    kind treats whatever is left as an opening quote.
    """

    smart = smartquotes.smartchars(language)
    try:
        apostrophe = smart.apostrophe
    except Exception:
        # Best effort: fall back to U+2019 when the quote set provides
        # no usable `apostrophe` attribute.
        apostrophe = '’'

    punct_class = r"""[!"#\$\%'()*+,-.\/:;<=>?\@\[\\\]\^_`{|}~]"""

    # Special case if the very first character is a quote
    # followed by punctuation at a non-word-break.
    # Close the quotes by brute force.
    # The assertion has to be a single-backslash \B: in a raw string a
    # doubled backslash would match a literal backslash followed by "B".
    text = re.sub(r"""^'(?=%s\B)""" % (punct_class, ), smart.csquote, text)
    text = re.sub(r"""^"(?=%s\B)""" % (punct_class, ), smart.cpquote, text)

    # Special case for double sets of quotes, e.g.:
    #   <p>He said, "'Quoted' words in a larger quote."</p>
    text = re.sub(r""""'(?=\w)""", smart.opquote + smart.osquote, text)
    text = re.sub(r"""'"(?=\w)""", smart.osquote + smart.opquote, text)

    # Special case for decade abbreviations (the '80s):
    if language.startswith('en'):  # TODO similar cases in other languages?
        # `flags` is passed by keyword on purpose: the fourth positional
        # argument of re.sub() is `count`, not `flags`.
        text = re.sub(r"""'(?=\d{2}s)""", apostrophe, text, flags=re.UNICODE)

    close_class = r"""[^\ \t\r\n\[\{\(\-]"""
    dec_dashes = r"""&#8211;|&#8212;"""

    # Get most opening single quotes:
    opening_single_quotes_regex = re.compile(
        r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or decimal entities
                            &\#x201[34];    # or hex
                    )
                    '                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes, ), re.VERBOSE | re.UNICODE)
    text = opening_single_quotes_regex.sub(r'\1' + smart.osquote, text)

    # In many locales, single closing quotes are different from apostrophe:
    if smart.csquote != apostrophe:
        # A quote between two word characters is an apostrophe
        # (\w already includes the digits).
        apostrophe_regex = re.compile(r"(?<=\w)'(?=\w)", re.UNICODE)
        text = apostrophe_regex.sub(apostrophe, text)
    # TODO: keep track of quoting level to recognize apostrophe in, e.g.,
    # "Ich fass' es nicht."

    # Closing single quotes, pass 1: the quote follows a "closing
    # context" character and is not followed by whitespace, a
    # word-final "s" or a digit.
    closing_single_quotes_regex = re.compile(
        r"""
                    (%s)
                    '
                    (?!\s  |       # whitespace
                       s\b |
                        \d         # digits   ('80s)
                    )
                    """ % (close_class, ), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1' + smart.csquote, text)

    # Closing single quotes, pass 2: the quote is followed by whitespace
    # or a word-final "s"; the follower is captured and re-emitted.
    closing_single_quotes_regex = re.compile(
        r"""
                    (%s)
                    '
                    (\s | s\b)
                    """ % (close_class, ), re.VERBOSE | re.UNICODE)
    text = closing_single_quotes_regex.sub(r'\1%s\2' % smart.csquote, text)

    # Any remaining single quotes should be opening ones:
    text = re.sub(r"""'""", smart.osquote, text)

    # Get most opening double quotes:
    opening_double_quotes_regex = re.compile(
        r"""
                    (
                            \s          |   # a whitespace char, or
                            &nbsp;      |   # a non-breaking space entity, or
                            --          |   # dashes, or
                            &[mn]dash;  |   # named dash entities
                            %s          |   # or decimal entities
                            &\#x201[34];    # or hex
                    )
                    "                 # the quote
                    (?=\w)            # followed by a word character
                    """ % (dec_dashes, ), re.VERBOSE)
    text = opening_double_quotes_regex.sub(r'\1' + smart.opquote, text)

    # Double closing quotes: a quote directly followed by whitespace ...
    closing_double_quotes_regex = re.compile(r'"(?=\s)')
    text = closing_double_quotes_regex.sub(smart.cpquote, text)

    # ... or directly preceded by a "closing context" character.
    closing_double_quotes_regex = re.compile(
        r"""
                    (%s)   # character that indicates the quote should be closing
                    "
                    """ % (close_class, ), re.VERBOSE)
    text = closing_double_quotes_regex.sub(r'\1' + smart.cpquote, text)

    # Any remaining quotes should be opening ones.
    text = re.sub(r'"', smart.opquote, text)

    return text