Пример #1
0
class DoublePunctuationRules:
    """Pattern/placeholder pairs for two-character ``?``/``!`` sequences.

    Each rule pairs a regex for one doubled punctuation sequence with a
    single placeholder glyph.
    """

    # "?!" -> ☉
    FirstRule = Rule(r"\?!", r'☉')
    # "!?" -> ☈
    SecondRule = Rule(r"!\?", r'☈')
    # "??" -> ☇
    ThirdRule = Rule(r"\?\?", r'☇')
    # "!!" -> ☄
    ForthRule = Rule(r"!!", r'☄')

    # Every rule of this group, in declaration order.
    All = [
        FirstRule,
        SecondRule,
        ThirdRule,
        ForthRule,
    ]
Пример #2
0
        class EscapeRegexReservedCharacters:
            """Rules pairing a bare bracket/dash with its backslash-escaped form.

            NOTE(review): a bare '(' , ')' or '[' is not a valid pattern for
            Python's ``re`` module. Confirm that ``Rule`` applies these first
            arguments as literal text rather than compiling them.
            """

            # "(" -> "\("
            LeftParen = Rule('(', r'\(')
            # ")" -> "\)"
            RightParen = Rule(')', r'\)')
            # "[" -> "\["
            LeftBracket = Rule('[', r'\[')
            # "]" -> "\]"
            RightBracket = Rule(']', r'\]')
            # "-" -> "\-"
            Dash = Rule('-', r'\-')

            # Every rule of this group, in declaration order.
            All = [
                LeftParen,
                RightParen,
                LeftBracket,
                RightBracket,
                Dash,
            ]
Пример #3
0
class SingleLetterAbbreviationRules:
    """Rules replacing the period after a lone capital letter with '∯'.

    Both patterns require the period to be followed by an optional comma
    and then a whitespace character.
    """

    # Capital letter at the very start of the text.
    # Rubular: http://rubular.com/r/e3H6kwnr6H
    SingleUpperCaseLetterAtStartOfLineRule = Rule(
        r"(?<=^[A-Z])\.(?=,?\s)", '∯')

    # Capital letter preceded by a whitespace character.
    # Rubular: http://rubular.com/r/gitvf0YWH4
    SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

    # Every rule of this group, in declaration order.
    All = [
        SingleUpperCaseLetterAtStartOfLineRule,
        SingleUpperCaseLetterRule,
    ]
Пример #4
0
class HTML:
    """Rules that delete HTML tags, both literal and entity-escaped.

    The patterns previously began with a stray '/' (the opening delimiter
    of a Ruby ``/.../`` regex literal). In a Python pattern string that
    slash is a literal character, so the rules only matched tags directly
    preceded by '/'. The delimiter has been removed.
    """

    # Literal tag such as <a href="x"> or </p>; replaced with nothing.
    # Rubular: http://rubular.com/r/9d0OVOEJWj
    HTMLTagRule = Rule(
        r"""<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[\^'">\s]+))?)+\s*|\s*)\/?>""",
        '')

    # Entity-escaped tag (&lt; ... gt;); replaced with nothing.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r"&lt;\/?[^gt;]*gt;", '')

    # Every rule of this group, in declaration order.
    All = [HTMLTagRule, EscapedHTMLTagRule]
Пример #5
0
class ExclamationPointRules:
    """Rules replacing certain exclamation points with the '&ᓴ&' placeholder.

    Each pattern matches a single '!' selected by a lookahead on the text
    that follows it.
    """

    # '!' immediately followed by a single or double quote.
    # Rubular: http://rubular.com/r/XS1XXFRfM2
    InQuotationRule = Rule(r"\!(?=(\'|\"))", r'&ᓴ&')

    # '!' followed by a comma, a whitespace character and a lowercase letter.
    # Rubular: http://rubular.com/r/sl57YI8LkA
    BeforeCommaMidSentenceRule = Rule(r"\!(?=\,\s[a-z])", r'&ᓴ&')

    # '!' followed by a whitespace character and a lowercase letter.
    # Rubular: http://rubular.com/r/f9zTjmkIPb
    MidSentenceRule = Rule(r"\!(?=\s[a-z])", r'&ᓴ&')

    # Every rule of this group, in declaration order.
    All = [
        InQuotationRule,
        BeforeCommaMidSentenceRule,
        MidSentenceRule,
    ]
Пример #6
0
class ReinsertEllipsisRules:
    """Rules mapping ellipsis placeholder glyphs back to period sequences."""

    # ƪ -> "..."
    SubThreeConsecutivePeriod = Rule(r"ƪ", r'...')
    # ♟ -> " . . . "
    SubThreeSpacePeriod = Rule(r"♟", r' . . . ')
    # ♝ -> ". . . ."
    SubFourSpacePeriod = Rule(r"♝", r'. . . .')
    # ☏ -> ".."
    SubTwoConsecutivePeriod = Rule(r"☏", r'..')
    # ∮ -> "."
    SubOnePeriod = Rule(r"∮", r'.')

    # Every rule of this group, in declaration order.
    All = [
        SubThreeConsecutivePeriod,
        SubThreeSpacePeriod,
        SubFourSpacePeriod,
        SubTwoConsecutivePeriod,
        SubOnePeriod,
    ]
Пример #7
0
        class SubEscapedRegexReservedCharacters:
            """Rules pairing a backslash-escaped bracket/dash with its bare form.

            NOTE(review): how the first argument is interpreted (regex vs.
            literal text) depends on ``Rule``, which is not defined here.
            """

            # r"\(" -> "("
            SubLeftParen = Rule(r'\(', '(')
            # r"\)" -> ")"
            SubRightParen = Rule(r'\)', ')')
            # r"\[" -> "["
            SubLeftBracket = Rule(r'\[', '[')
            # r"\]" -> "]"
            SubRightBracket = Rule(r'\]', ']')
            # r"\-" -> "-"
            SubDash = Rule(r'\-', '-')

            # Every rule of this group, in declaration order.
            All = [
                SubLeftParen,
                SubRightParen,
                SubLeftBracket,
                SubRightBracket,
                SubDash,
            ]
Пример #8
0
class AmPmRules:
    """Rules turning the trailing '∯' of an A∯M∯ / P∯M∯ marker back into '.'.

    Every pattern only matches when the marker is followed by a whitespace
    character and an uppercase letter.
    """

    # "P∯M∯" before whitespace + capital letter.
    # Rubular: http://rubular.com/r/Vnx3m4Spc8
    UpperCasePmRule = Rule(r"(?<=P∯M)∯(?=\s[A-Z])", '.')

    # "A∯M∯" before whitespace + capital letter.
    # Rubular: http://rubular.com/r/AJMCotJVbW
    UpperCaseAmRule = Rule(r"(?<=A∯M)∯(?=\s[A-Z])", '.')

    # "p∯m∯" before whitespace + capital letter.
    # Rubular: http://rubular.com/r/13q7SnOhgA
    LowerCasePmRule = Rule(r"(?<=p∯m)∯(?=\s[A-Z])", '.')

    # "a∯m∯" before whitespace + capital letter.
    # Rubular: http://rubular.com/r/DgUDq4mLz5
    LowerCaseAmRule = Rule(r"(?<=a∯m)∯(?=\s[A-Z])", '.')

    # Every rule of this group, in declaration order.
    All = [
        UpperCasePmRule,
        UpperCaseAmRule,
        LowerCasePmRule,
        LowerCaseAmRule,
    ]
Пример #9
0
class EllipsisRules:
    """Rules replacing ellipsis-like period runs with placeholder glyphs.

    ``All`` lists the spaced variants first, then the four-period form,
    then the three-period forms, ending with the catch-all
    ``OtherThreePeriodRule``.
    """

    # "..." followed by whitespace and a capital letter -> "☏."
    # Rubular: http://rubular.com/r/i60hCK81fz
    ThreeConsecutiveRule = Rule(r"\.\.\.(?=\s+[A-Z])", '☏.')

    # First three of four periods ("....") after a non-space character,
    # when the fourth is followed by whitespace and a capital letter -> ƪ
    # Rubular: http://rubular.com/r/Hdqpd90owl
    FourConsecutiveRule = Rule(r"(?<=\S)\.{3}(?=\.\s[A-Z])", 'ƪ')

    # " . . . " (three space-separated periods, whitespace on both sides) -> ♟
    # Rubular: http://rubular.com/r/YBG1dIHTRu
    ThreeSpaceRule = Rule(r"(\s\.){3}\s", '♟')

    # ". . . ." after a lowercase letter, at the end of the text/line -> ♝
    # The trailing group used to be "(|$|\n)": its empty first alternative
    # always matches, which made the end-of-text/line requirement a no-op.
    # The empty branch is the remnant of Ruby's "\z" anchor, spelled "\Z"
    # in Python.
    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FourSpaceRule = Rule(r"(?<=[a-z])(\.\s){3}\.(\Z|$|\n)", '♝')

    # Any remaining "..." -> ƪ
    OtherThreePeriodRule = Rule(r"\.\.\.", 'ƪ')

    # Every rule of this group, in the order described above.
    All = [
        ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
        ThreeConsecutiveRule, OtherThreePeriodRule
    ]
Пример #10
0
class Numbers:
    """Rules replacing periods that sit next to digits with '∯'."""

    # Period directly followed by a digit.
    # Rubular: http://rubular.com/r/oNyxBOqbyy
    PeriodBeforeNumberRule = Rule(r"\.(?=\d)", '∯')

    # Period directly after a digit and directly before a non-space character.
    # Rubular: http://rubular.com/r/EMk5MpiUzt
    NumberAfterPeriodBeforeLetterRule = Rule(r"(?<=\d)\.(?=\S)", '∯')

    # Period after "\r" + one digit, followed by whitespace and a non-space
    # character, or by ")".
    # Rubular: http://rubular.com/r/rf4l1HjtjG
    NewLineNumberPeriodSpaceLetterRule = Rule(
        r"(?<=\r\d)\.(?=(\s\S)|\))", '∯')

    # Same lookahead, but after a single digit at the start of the text.
    # Rubular: http://rubular.com/r/HPa4sdc6b9
    StartLineNumberPeriodRule = Rule(r"(?<=^\d)\.(?=(\s\S)|\))", '∯')

    # Same lookahead, but after two digits at the start of the text.
    # Rubular: http://rubular.com/r/NuvWnKleFl
    StartLineTwoDigitNumberPeriodRule = Rule(
        r"(?<=^\d\d)\.(?=(\s\S)|\))", '∯')

    # Every rule of this group, in declaration order.
    All = [
        PeriodBeforeNumberRule,
        NumberAfterPeriodBeforeLetterRule,
        NewLineNumberPeriodSpaceLetterRule,
        StartLineNumberPeriodRule,
        StartLineTwoDigitNumberPeriodRule,
    ]
Пример #11
0
class SubSymbolsRules:
    """Rules mapping placeholder glyphs back to the punctuation they stand for.

    ``FullWidthExclamation`` and ``FullWidthQuestionMark`` now restore the
    full-width characters '！' and '？'. They previously restored ASCII
    '!' and '?', exactly like ``ExclamationPoint`` and ``QuestionMark``,
    while the sibling ``FullWidthPeriod`` restores full-width '。'.
    NOTE(review): the rules that insert these placeholders are not defined
    in this class; confirm they are produced from the full-width marks.
    """

    Period = Rule(r'∯', '.')
    ArabicComma = Rule(r"♬", r'،')
    # Named SemiColon, but the replacement is a colon (kept as found).
    SemiColon = Rule(r"♭", r':')
    FullWidthPeriod = Rule(r"&ᓰ&", r'。')
    SpecialPeriod = Rule(r"&ᓱ&", r'.')
    FullWidthExclamation = Rule(r"&ᓳ&", r'！')
    ExclamationPoint = Rule(r"&ᓴ&", r'!')
    QuestionMark = Rule(r"&ᓷ&", r'?')
    FullWidthQuestionMark = Rule(r"&ᓸ&", r'？')
    MixedDoubleQE = Rule(r"☉", r'?!')
    MixedDoubleQQ = Rule(r"☇", r'??')
    MixedDoubleEQ = Rule(r"☈", r'!?')
    MixedDoubleEE = Rule(r"☄", r'!!')
    LeftParens = Rule(r"&✂&", r'(')
    RightParens = Rule(r"&⌬&", r')')
    # The marker glyph is replaced with the empty string.
    TemporaryEndingPunctutation = Rule('ȸ', '')
    Newline = Rule(r"ȹ", r"\n")

    # Every rule of this group, in declaration order.
    All = [
        Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
        FullWidthExclamation, ExclamationPoint, QuestionMark,
        FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
        MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
        Newline
    ]
Пример #12
0
class Abbreviations:
    """Rule for periods that sit directly between two word characters."""

    # A "." with a word character on each side is swapped for '∮'; the
    # neighbouring characters are kept through groups 1 and 3.
    # Rubular: http://rubular.com/r/EUbZCNfgei
    WithMultiplePeriodsAndEmailRule = Rule(
        r"(\w)(\.)(\w)",
        r'\1∮\3',
    )
Пример #13
0
    # Lowercase abbreviation strings, written without the trailing period.
    # Presumably these are abbreviations that precede a name -- confirm
    # against the code that reads this set.
    PREPOSITIVE_ABBREVIATIONS = {
        'adm', 'attys',
        'brig',
        'capt', 'cmdr', 'col', 'cpl',
        'det', 'dr',
        'gen', 'gov',
        'ing',
        'lt',
        'maj', 'messrs', 'mr', 'mrs', 'ms', 'mssrs', 'mt',
        'ph', 'prof',
        'rep', 'reps', 'rev',
        'sen', 'sens', 'sgt', 'st', 'supt',
        'v', 'vs',
    }

    # Presumably abbreviations that precede a number -- confirm against
    # the code that reads this set.
    NUMBER_ABBREVIATIONS = {
        'art',
        'ext',
        'no',
        'nos',
        'p',
        'pp',
    }


class Abbreviations:
    """Rule for periods that sit directly between two word characters."""

    # A "." with a word character on each side is swapped for '∮'; the
    # neighbouring characters are kept through groups 1 and 3.
    # Rubular: http://rubular.com/r/EUbZCNfgei
    WithMultiplePeriodsAndEmailRule = Rule(
        r"(\w)(\.)(\w)",
        r'\1∮\3',
    )


# Period after "<letter>°" and before optional whitespace plus digits -> '∯'.
# The letter class used to be "[a-zA-z]"; the "A-z" range also covers the
# ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so it was narrowed to
# letters only.
# Rubular: http://rubular.com/r/G2opjedIm9
GeoLocationRule = Rule(r"(?<=[a-zA-Z]°)\.(?=\s*\d+)", r'∯')

# Period after whitespace and directly before one of the listed file
# extensions, itself followed by whitespace -> '∯'.
FileFormatRule = Rule(
    r"(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)",
    r'∯')

# Every newline character -> the 'ȹ' placeholder.
SingleNewLineRule = Rule(r"\n", r'ȹ')


class DoublePunctuationRules:
    """Pattern/placeholder pairs for two-character ``?``/``!`` sequences.

    Each rule pairs a regex for one doubled punctuation sequence with a
    single placeholder glyph.
    """

    # "?!" -> ☉
    FirstRule = Rule(r"\?!", r'☉')
    # "!?" -> ☈
    SecondRule = Rule(r"!\?", r'☈')
    # "??" -> ☇
    ThirdRule = Rule(r"\?\?", r'☇')
    # "!!" -> ☄
    ForthRule = Rule(r"!!", r'☄')

    # Every rule of this group, in declaration order.
    All = [
        FirstRule,
        SecondRule,
        ThirdRule,
        ForthRule,
    ]

Пример #14
0
class PDF:
    """Rules for newlines that fall in the middle of a sentence.

    The patterns previously began with a stray '/' (the opening delimiter
    of a Ruby ``/.../`` regex literal). In a Python pattern string that
    slash is a literal character, so the rules only matched text containing
    an actual '/' at that position. The delimiter has been removed.
    """

    # Newline preceded by a non-newline character plus whitespace and
    # followed by a non-space character; replaced with nothing.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(r"(?<=[^\n]\s)\n(?=\S)", '')

    # Newline directly followed by a lowercase letter; replaced with a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
Пример #15
0
from pygmatic_segmenter.types import Rule

# NOTE: every pattern below previously began with a stray '/' (the opening
# delimiter of a Ruby ``/.../`` regex literal). In a Python pattern string
# that slash is a literal character, so each rule only matched when a real
# '/' preceded the intended text. The delimiter has been removed.

# Newline followed by one or two letters and another newline; removed.
# Rubular: http://rubular.com/r/V57WnM9Zut
NewLineInMiddleOfWordRule = Rule(r"\n(?=[a-zA-Z]{1,2}\n)", '')

# Newline, single space, newline -> carriage return.
# Rubular: http://rubular.com/r/dMxp5MixFS
DoubleNewLineWithSpaceRule = Rule(r"\n \n", "\r")

# Two consecutive newlines -> carriage return.
# Rubular: http://rubular.com/r/H6HOJeA8bq
DoubleNewLineRule = Rule(r"\n\n", "\r")

# Newline followed by a period and then whitespace; removed.
# Rubular: http://rubular.com/r/FseyMiiYFT
NewLineFollowedByPeriodRule = Rule(r"\n(?=\.(\s|\n))", '')

# Any newline -> carriage return.
ReplaceNewlineWithCarriageReturnRule = Rule(r"\n", "\r")

# Literal backslash + "n" / backslash + "r" -> real newline / carriage return.
EscapedNewLineRule = Rule(r"\\n", "\n")
EscapedCarriageReturnRule = Rule(r"\\r", "\r")

# Literal backslash, space, "n" -> real newline.
TypoEscapedNewLineRule = Rule(r"\\\ n", "\n")

# Literal backslash, space, "r" -> real carriage return.
TypoEscapedCarriageReturnRule = Rule(r"\\\ r", "\r")

# Inline markers of the form {b^>123<b^} (plain or with &gt;/&lt; entities);
# removed.
# Rubular: http://rubular.com/r/bAJrhyLNeZ
InlineFormattingRule = Rule(r"\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}", '')

# Five or more periods followed by a number such as "12" or "3-4"
# -> carriage return.
# Rubular: http://rubular.com/r/8mc1ArOIGy
TableOfContentsRule = Rule(r"\.{5,}\s*\d+-*\d*", "\r")

# Five or more consecutive periods -> single space.
# Rubular: http://rubular.com/r/DwNSuZrNtk
ConsecutivePeriodsRule = Rule(r"\.{5,}", ' ')