예제 #1
파일: ellipsis.py 프로젝트: yushu-liu/pySBD
class EllipsisRules(object):
    """Rules that swap ellipsis-like period runs for placeholder symbols.

    Every attribute pairs a regex pattern with a replacement string through
    ``Rule`` (defined elsewhere); ``All`` gathers the rules in a fixed order.
    """

    # below rules aren't similar to original rules of pragmatic segmenter
    # modification: spaces replaced with same number of symbols
    # Rubular: http://rubular.com/r/i60hCK81fz
    # Three periods followed by whitespace and an uppercase A-Z letter: the
    # replacement keeps one literal '.' after two placeholder symbols.
    ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

    # Rubular: http://rubular.com/r/Hdqpd90owl
    # Three periods preceded by a non-space character and followed by a
    # fourth period, one whitespace character and an uppercase A-Z letter.
    FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

    # Rubular: http://rubular.com/r/YBG1dIHTRu
    # " . . . " (whitespace-period three times, then whitespace): seven
    # matched characters are replaced by seven placeholder symbols.
    ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    # ". . . ." after a lowercase letter, at end of string or before the
    # two-character sequence backslash + 'n'.
    # NOTE(review): in a raw string '\\n' matches a literal backslash followed
    # by 'n', not a newline character -- confirm this is intended.
    FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

    # Catch-all for any three consecutive periods.
    OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

    # The catch-all is listed last; presumably the rules are applied in this
    # order so the more specific patterns get the first chance to match.
    All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
           ThreeConsecutiveRule, OtherThreePeriodRule]
예제 #2
파일: numbers.py 프로젝트: yushu-liu/pySBD
class Numbers(object):
    # Rubular: http://rubular.com/r/oNyxBOqbyy
    PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

    # Rubular: http://rubular.com/r/EMk5MpiUzt
    NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

    # Rubular: http://rubular.com/r/rf4l1HjtjG
    NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

    # Rubular: http://rubular.com/r/HPa4sdc6b9
    StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

    # Rubular: http://rubular.com/r/NuvWnKleFl
    StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

    All = [
        PeriodBeforeNumberRule, NumberAfterPeriodBeforeLetterRule,
        NewLineNumberPeriodSpaceLetterRule, StartLineNumberPeriodRule,
예제 #3
class Persian(Common, Standard):
    """Persian ('fa') rule set built on top of ``Common`` and ``Standard``."""

    iso_code = 'fa'

    # Sentence-ending marks, including the Arabic question mark.
    Punctuations = ['?', '!', ':', '.', '؟']
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'

    # Rubular: http://rubular.com/r/RX5HpdDIyv
    # A colon between two digits is swapped for a placeholder.
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # Rubular: http://rubular.com/r/kPRgApNHUg
    # An Arabic comma followed by "<space><word>،" is swapped for a placeholder.
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):
        """Persian variant of the abbreviation replacer."""

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace every period that directly follows ``am`` with '∯'.

            ``am`` is treated as literal text (presumably the matched
            abbreviation -- verify against the caller), so it is escaped
            before being placed inside the lookbehind; an abbreviation that
            contains regex metacharacters such as '.' no longer matches
            unrelated text. ``index`` and ``character_array`` are accepted
            for signature compatibility and are not used here.

            Returns the updated text.
            """
            # Raw string: '\.' in a plain string is an invalid escape sequence.
            pattern = r'(?<={0})\.'.format(re.escape(am))
            return re.sub(pattern, '∯', txt)
예제 #4
    class Abbreviation(object):
        """Defines the abbreviations for each language (if available)"""

        # Lower-case abbreviations, written without their trailing period.
        ABBREVIATIONS = [
            'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz',
            'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld',
            'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt',
            'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp',
            'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det',
            'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp',
            'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft',
            'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway',
            'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing',
            'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken',
            'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md',
            'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss',
            'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr',
            'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov',
            'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa',
            'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz',
            'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps',
            'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept',
            'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex',
            'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt',
            'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig'
        ]

        # Kept as a separate list: without its own brackets the adjacent
        # literals 'fig' 'adm' would silently concatenate into 'figadm'.
        PREPOSITIVE_ABBREVIATIONS = [
            'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr',
            'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt',
            'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen',
            'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig'
        ]

        NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

        # Rubular: http://rubular.com/r/EUbZCNfgei
        # The original pattern was r'(\w)(\.)(\w)'; \w in python matches
        # unicode abbreviations also so limit to english alphanumerics
        WithMultiplePeriodsAndEmailRule = Rule(
            r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])', '\\1∮\\3')
예제 #5
파일: standard.py 프로젝트: yushu-liu/pySBD
class Standard(object):
    """Language-independent punctuation list and substitution rules."""

    # This class holds the punctuation marks.
    # NOTE(review): the list contains visually duplicate entries; presumably
    # some were full-width variants before a text normalization -- confirm.
    Punctuations = ['。', '.', '.', '!', '!', '?', '?']

    # Rubular: http://rubular.com/r/G2opjedIm9
    # Period after "<letter>°" and before a number. The class is [a-zA-Z];
    # the former 'A-z' range also admitted the characters [ \ ] ^ _ `.
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # Period that starts a known file extension standing on its own
    # (whitespace before the period and after the extension).
    FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯')

    # Every newline becomes a placeholder symbol.
    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # Rubular: http://rubular.com/r/aXPUGm6fQh
    # Question mark directly before a single or double quote.
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Runs of three or more whitespace characters collapse to one space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    # Turns the single-quote placeholder back into an apostrophe.
    SubSingleQuoteRule = Rule(r'&⎋&', "'")
예제 #6
 class SubSymbolsRules(object):
     Period = Rule(r'∯', '.')
     ArabicComma = Rule(r'♬', '،')
     SemiColon = Rule(r'♭', ':')
     FullWidthPeriod = Rule(r'&ᓰ&', '。')
     SpecialPeriod = Rule(r'&ᓱ&', '.')
     FullWidthExclamation = Rule(r'&ᓳ&', '!')
     ExclamationPoint = Rule(r'&ᓴ&', '!')
     QuestionMark = Rule(r'&ᓷ&', '?')
     FullWidthQuestionMark = Rule(r'&ᓸ&', '?')
     MixedDoubleQE = Rule(r'☉', '?!')
     MixedDoubleQQ = Rule(r'☇', '??')
     MixedDoubleEQ = Rule(r'☈', '!?')
     MixedDoubleEE = Rule(r'☄', '!!')
     LeftParens = Rule(r'&✂&', '(')
     RightParens = Rule(r'&⌬&', ')')
     TemporaryEndingPunctutation = Rule(r'ȸ', '')
     Newline = Rule(r'ȹ', "\n")
     All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
            FullWidthExclamation, ExclamationPoint, QuestionMark,
            FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
            MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
예제 #7
class Standard:
    """Language-independent punctuation list, abbreviations and rule groups.

    Each nested class groups related ``Rule`` objects (pattern, replacement);
    most expose an ``All`` list with the rules in a fixed order.
    """

    # This class holds the punctuation marks.
    # NOTE(review): the list contains visually duplicate entries; presumably
    # some were full-width variants before a text normalization -- confirm.
    Punctuations = ['。', '.', '.', '!', '!', '?', '?']

    # Rubular: http://rubular.com/r/G2opjedIm9
    # Period after "<letter>°" and before a number. The class is [a-zA-Z];
    # the former 'A-z' range also admitted the characters [ \ ] ^ _ `.
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # Period that starts a known file extension standing on its own.
    FileFormatRule = Rule(r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)', '∯')

    # Every newline becomes a placeholder symbol.
    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # Rubular: http://rubular.com/r/aXPUGm6fQh
    # Question mark directly before a single or double quote.
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Runs of three or more whitespace characters collapse to one space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    # Turns the single-quote placeholder back into an apostrophe.
    SubSingleQuoteRule = Rule(r'&⎋&', "'")

    class Abbreviation(object):
        """Defines the abbreviations for each language (if available)"""
        ABBREVIATIONS = ['adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz', 'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart', 'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif', 'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn', 'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del', 'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g', 'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed', 'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr', 'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind', 'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans', 'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass', 'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min', 'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs', 'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no', 'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord', 'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph', 'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs', 'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen', 'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg', 'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v', 'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo', 'yuk', 'fig']
        PREPOSITIVE_ABBREVIATIONS = ['adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr', 'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig']
        NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

        # Part of "Abbreviations" ruby module
        # Rubular: http://rubular.com/r/EUbZCNfgei
        # A period between two word characters is swapped for '∮'.
        WithMultiplePeriodsAndEmailRule = Rule(r'(\w)(\.)(\w)', '\\1∮\\3')

    class DoublePunctuationRules(object):
        """Placeholders for the two-character marks ?!, !?, ?? and !!."""
        FirstRule = Rule(r'\?!', '☉')
        SecondRule = Rule(r'!\?', '☈')
        ThirdRule = Rule(r'\?\?', '☇')
        ForthRule = Rule(r'!!', '☄')
        # Single alternation covering the same four sequences as the rules.
        DoublePunctuation = r'\?!|!\?|\?\?|!!'
        All = [FirstRule, SecondRule, ThirdRule, ForthRule]

    class ExclamationPointRules(object):
        """Exclamation marks that are swapped for the '&ᓴ&' placeholder."""
        # Rubular: http://rubular.com/r/XS1XXFRfM2
        # '!' directly before a single or double quote.
        InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

        # Rubular: http://rubular.com/r/sl57YI8LkA
        # '!' before ", <lowercase letter>".
        BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

        # Rubular: http://rubular.com/r/f9zTjmkIPb
        # '!' before "<whitespace><lowercase letter>".
        MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

        All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule]

    class SubSymbolsRules(object):
        """Rules that turn placeholder symbols back into real punctuation."""
        Period = Rule(r'∯', '.')
        ArabicComma = Rule(r'♬', '،')
        SemiColon = Rule(r'♭', ':')
        FullWidthPeriod = Rule(r'&ᓰ&', '。')
        SpecialPeriod = Rule(r'&ᓱ&', '.')
        FullWidthExclamation = Rule(r'&ᓳ&', '!')
        ExclamationPoint = Rule(r'&ᓴ&', '!')
        QuestionMark = Rule(r'&ᓷ&', '?')
        FullWidthQuestionMark = Rule(r'&ᓸ&', '?')
        MixedDoubleQE = Rule(r'☉', '?!')
        MixedDoubleQQ = Rule(r'☇', '??')
        MixedDoubleEQ = Rule(r'☈', '!?')
        MixedDoubleEE = Rule(r'☄', '!!')
        LeftParens = Rule(r'&✂&', '(')
        RightParens = Rule(r'&⌬&', ')')
        TemporaryEndingPunctutation = Rule(r'ȸ', '')
        Newline = Rule(r'ȹ', "\n")
        # The list was left unterminated and omitted Newline, the only rule
        # defined above that was not yet listed; it is closed with it here.
        All = [Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
               FullWidthExclamation, ExclamationPoint, QuestionMark,
               FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
               MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
               Newline]

    class EllipsisRules(object):
        """Rules that swap ellipsis-like period runs for placeholder symbols."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: spaces replaced with same number of symbols
        # Rubular: http://rubular.com/r/i60hCK81fz
        ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

        # Rubular: http://rubular.com/r/Hdqpd90owl
        FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

        # Rubular: http://rubular.com/r/YBG1dIHTRu
        ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

        # Rubular: http://rubular.com/r/2VvZ8wRbd8
        # NOTE(review): in a raw string '\\n' matches a literal backslash
        # followed by 'n', not a newline character -- confirm intended.
        FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

        # Catch-all for any three consecutive periods; listed last in All.
        OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

        All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
               ThreeConsecutiveRule, OtherThreePeriodRule]

    class ReinsertEllipsisRules(object):
        """Rules that restore the period runs masked by EllipsisRules."""
        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: symbols replaced with same number of ellipses
        SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
        SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
        SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
        SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
        SubOnePeriod = Rule(r'∮', '.')
        All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod, SubFourSpacePeriod,
               SubTwoConsecutivePeriod, SubOnePeriod]

    class AbbreviationReplacer(AbbreviationReplacer):
        # Capitalised words, split from a space-separated string into a list.
        SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
            "More She That The There They We What When Where Who Why".split(" ")
예제 #8
파일: rules.py 프로젝트: yushu-liu/pySBD
class PDF(object):
    """Newline clean-up rules for PDF-style text (per the class name)."""

    # Rubular: http://rubular.com/r/UZAVcwqck8
    # Removes a newline that follows "<non-newline char><whitespace>" and is
    # itself followed by a non-whitespace character.
    NewLineInMiddleOfSentenceRule = Rule(r'(?<=[^\n]\s)\n(?=\S)', '')

    # Rubular: http://rubular.com/r/eaNwGavmdo
    # Replaces a newline directly before a lowercase letter with a space.
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
예제 #9
파일: rules.py 프로젝트: yushu-liu/pySBD
class CleanRules(object):
    """Regex patterns and ``Rule`` objects used to clean raw text.

    Most rules rewrite newlines to carriage returns, drop stray newlines,
    or normalise escaped or typo'd line-break sequences and quotation marks.
    """

    # NOTE: Caution: Might require \\ for special characters
    # if regex is defined with r'' then dont
    # add extra \\ for special characters
    # Rubular: http://rubular.com/r/V57WnM9Zut
    # Newline before one or two letters that are themselves followed by a newline.
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # Rubular: http://rubular.com/r/FseyMiiYFT
    # Newline before a period that is followed by whitespace.
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # Literal two-character sequences backslash+'n' / backslash+'r'.
    EscapedNewLineRule = Rule(r'\\n', "\n")

    EscapedCarriageReturnRule = Rule(r'\\r', "\r")

    # Same sequences with a space typed between the backslash and the letter.
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")

    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    InlineFormattingRule = Rule(r'{b\^&gt;\d*&lt;b\^}|{b\^>\d*<b\^}', '')

    # Rubular: http://rubular.com/r/8mc1ArOIGy
    # Dot leaders (four or more periods) followed by a page number or range.
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # Rubular: http://rubular.com/r/l6KN6rH5XE
    # The rule below referenced this constant without any definition in view,
    # which raises NameError when the class body runs.
    # NOTE(review): the pattern is reconstructed by analogy with
    # NO_SPACE_BETWEEN_SENTENCES_REGEX (digit instead of lowercase letter
    # before the period) -- confirm against the project's intended value.
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    # Newline directly before a bullet. The lookahead used to contain a stray
    # apostrophe, so it only fired when the bullet was followed by a quote.
    NewLineFollowedByBulletRule = Rule(r"\n(?=•)", "\r")

    # Two apostrophes or two backticks become one double quote.
    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')
예제 #10
class Common(object):

    # added special case: r"[。..!!? ]{2,}" to handle intermittent dots, exclamation, etc.
    # r"[。..!!?] at end to handle single instances of these symbol inputs
    SENTENCE_BOUNDARY_REGEX = r"((?:[^)])*)(?=\s?[A-Z])|「(?:[^」])*」(?=\s[A-Z])|\((?:[^\)]){2,}\)(?=\s[A-Z])|\'(?:[^\'])*[^,]\'(?=\s[A-Z])|\"(?:[^\"])*[^,]\"(?=\s[A-Z])|\“(?:[^\”])*[^,]\”(?=\s[A-Z])|[。..!!?? ]{2,}|\S.*?[。..!!??ȸȹ☉☈☇☄]|[。..!!??]"

    # # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # # Rubular: http://rubular.com/r/NEv265G2X2
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Searches for periods within an abbreviation and
        replaces the periods.
        """
        # Rubular: http://rubular.com/r/e3H6kwnr6H
        # Period after a single uppercase letter at the very start, before whitespace.
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Rubular: http://rubular.com/r/gitvf0YWH4
        # Period after "<whitespace><uppercase letter>", before optional comma + whitespace.
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule
        ]

    class AmPmRules(object):
        """Rules that turn the final '∯' of an A.M./P.M. marker back into '.'.

        Each pattern matches a '∯' that follows the masked marker and precedes
        whitespace plus an uppercase A-Z letter.
        """

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        # NOTE(review): this lookbehind starts with a space, unlike the three
        # rules below -- confirm whether that difference is intended.
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]

    class Numbers(object):
        # Rubular: http://rubular.com/r/oNyxBOqbyy
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # Rubular: http://rubular.com/r/EMk5MpiUzt
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # Rubular: http://rubular.com/r/rf4l1HjtjG
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/HPa4sdc6b9
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/NuvWnKleFl
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
예제 #11
 def remove_newline_in_middle_of_word(self):
     """Drop a newline that sits between 'の' and a non-whitespace character.

     The rule is built inline and applied to ``self.text``; the result is
     stored back on ``self.text``.
     """
     self.text = Text(self.text).apply(Rule(r'(?<=の)\n(?=\S)', ''))
예제 #12
class ListItemReplacer(object):

    # Lower-case roman numerals i..xx in counting order. Position matters:
    # neighbouring list markers are recognised by comparing list indexes, so
    # the sequence must not contain repeats (a duplicated "x xi xii xiii" run
    # used to put 'xv' five slots after 'xiv').
    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx".split(' ')
    # The 26 lower-case ASCII letters, in order.
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    # A single lowercase letter at the start or after whitespace, directly
    # followed by a period (the period itself is not part of the match).
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    # One or more lowercase letters directly before ')', preceded by '(',
    # the start of the text, or whitespace.
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    # '♨' and '☝' are the placeholder symbols used for list-item periods and
    # list-item parentheses in the methods of this class.
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    # Whitespace after two non-space characters and before
    # "<non-space><optional whitespace><digits>♨" becomes a carriage return.
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    # Same, but directly before a one- or two-digit number followed by '♨'.
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    # Same, but the number is followed by '☝'.
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # One- or two-digit list numbers (the period is only looked at, not
    # consumed). Covers numbers after whitespace or at the start, optionally
    # behind '-' or '⁃', followed by ". " or ".)". The ninth alternative uses
    # (?<=\s\-) like its siblings; it previously read (?<=s\-), which only
    # matched after the literal text "s-".
    NUMBERED_LIST_REGEX_1 = r'\s\d{1,2}(?=\.\s)|^\d{1,2}(?=\.\s)|\s\d{1,2}(?=\.\))|^\d{1,2}(?=\.\))|(?<=\s\-)\d{1,2}(?=\.\s)|(?<=^\-)\d{1,2}(?=\.\s)|(?<=\s\⁃)\d{1,2}(?=\.\s)|(?<=^\⁃)\d{1,2}(?=\.\s)|(?<=\s\-)\d{1,2}(?=\.\))|(?<=^\-)\d{1,2}(?=\.\))|(?<=\s\⁃)\d{1,2}(?=\.\))|(?<=^\⁃)\d{1,2}(?=\.\))'
    # 1. abcd
    # 2. xyz
    # Same positions as above, but the match includes the trailing period.
    NUMBERED_LIST_REGEX_2 = r'(?<=\s)\d{1,2}\.(?=\s)|^\d{1,2}\.(?=\s)|(?<=\s)\d{1,2}\.(?=\))|^\d{1,2}\.(?=\))|(?<=\s\-)\d{1,2}\.(?=\s)|(?<=^\-)\d{1,2}\.(?=\s)|(?<=\s\⁃)\d{1,2}\.(?=\s)|(?<=^\⁃)\d{1,2}\.(?=\s)|(?<=\s\-)\d{1,2}\.(?=\))|(?<=^\-)\d{1,2}\.(?=\))|(?<=\s\⁃)\d{1,2}\.(?=\))|(?<=^\⁃)\d{1,2}\.(?=\))'
    # 1) abcd
    # 2) xyz
    # One- or two-digit number directly before ")" and whitespace.
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    # Letters before ')'; the first alternative keeps the opening '(' in the match.
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    # A single letter plus its period, at the start or after whitespace.
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    # A lower-case roman numeral in parentheses, followed by whitespace and an
    # uppercase A-Z letter; group 1 is the numeral without the parentheses.
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        """Store ``text`` on the instance as ``self.text`` for later processing."""
        self.text = text

    def add_line_break(self):
        """Return ``self.text``.

        As written here the method performs no transformation of its own
        before returning the stored text.
        """
        return self.text

    def replace_parens(self):
        text = re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)
        return text

    def format_numbered_list_with_parens(self):
        """Apply ``ListMarkerRule`` to ``self.text`` and store the result.

        ``ListMarkerRule`` pairs the '☝' placeholder with an empty replacement;
        ``Text(...).apply`` is defined elsewhere and presumably performs the
        substitution.
        """
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Run ``scan_lists`` for period-style numbered lists.

        ``NUMBERED_LIST_REGEX_1`` finds the candidate numbers and
        ``NUMBERED_LIST_REGEX_2`` locates the "number." spans to rewrite;
        '♨' is the replacement symbol and ``strip=True`` is passed through.
        """
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Apply ``SubstituteListPeriodRule`` to ``self.text`` and store the result.

        The rule pairs the '♨' placeholder with '∯'; ``Text(...).apply`` is
        defined elsewhere and presumably performs the substitution.
        """
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
        return self.txt

    def format_roman_numeral_lists(self):
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
        return self.txt

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        txt = self.iterate_alphabet_array(
        return txt

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        txt = self.iterate_alphabet_array(
        return txt

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        list_array = re.findall(regex1, self.text)
        list_array = list(map(int, list_array))
        for ind, item in enumerate(list_array):
            # to avoid IndexError
            # ruby returns nil if index is out of range
            if (ind < len(list_array) - 1 and item + 1 == list_array[ind + 1]):
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                if (((item - 1) == list_array[ind - 1]) or
                    ((item == 0) and (list_array[ind - 1] == 9)) or
                    ((item == 9) and (list_array[ind - 1] == 0))):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):

        def replace_item(match, val=None, strip=False, repl='♨'):
            match = match.group()
            if strip:
                match = str(match).strip()
            chomped_match = match if len(match) == 1 else match.strip('.])')
            if str(each) == chomped_match:
                return "{}{}".format(each, replacement)
                return str(match)

        self.text = re.sub(regex, partial(replace_item, val=each,
                           strip=strip, repl=replacement), self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                    r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,

    def replace_parens_in_numbered_list(self):
        """Run ``scan_lists`` for "1) 2)" style numbered lists.

        ``NUMBERED_LIST_PARENS_REGEX`` is used both to find the candidate
        numbers and to locate the spans to rewrite; '☝' is the replacement
        symbol and ``strip`` keeps its default.
        """
        self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(

    def replace_alphabet_list(self, a):
        Input: 'a. ffegnog b. fgegkl c.'
        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯

        def replace_letter_period(match, val=None):
            match = match.group()
            match_wo_period = match.strip('.')
            if match_wo_period == val:
                return '\r{}∯'.format(match_wo_period)
                return match

                     partial(replace_letter_period, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_alphabet_list_parens(self, a):
        Input: "a) ffegnog (b) fgegkl c)"
        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"

        def replace_alphabet_paren(match, val=None):
            match = match.group()
            if '(' in match:
                match_wo_paren = match.strip('(')
                if match_wo_paren == val:
                    return '\r&✂&{}'.format(match_wo_paren)
                    return match
                if match == val:
                    return '\r{}'.format(match)
                    return match

        # Make it cases-insensitive
                     partial(replace_alphabet_paren, val=a),
                     self.text, flags=re.IGNORECASE)
        return txt

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch list marker ``a`` to the parenthesis or period replacer.

        Calls ``replace_alphabet_list_parens(a)`` when ``parens`` is truthy
        and ``replace_alphabet_list(a)`` otherwise, and returns that call's
        result. The two calls are alternatives: previously both ran when
        ``parens`` was truthy (the second receiving the first one's output as
        its marker) and neither ran otherwise.
        """
        if parens:
            a = self.replace_alphabet_list_parens(a)
        else:
            a = self.replace_alphabet_list(a)
        return a

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        if (len(alphabet) == 0) & (len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        if (len(alphabet) == 0) & (len(list_array) == 0) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                    list_array[i + 1] not in alphabet):
            return self.text
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        result = self.replace_correct_alphabet_list(a, parens)
        return result

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        list_array = re.findall(regex, self.text)
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        list_array = [i for i in list_array if i in alphabet]
        for ind, each in enumerate(list_array):
            if ind == len(list_array) - 1:
                self.text = self.last_array_item_replacement(each, ind, alphabet, list_array, parens)
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text