class DoublePunctuationRules:
    """Rules that collapse a two-character ?/! pair into one placeholder.

    Every rule is built as ``Rule(pattern, replacement)``; each of the four
    possible pairs is given its own distinct placeholder symbol.
    """

    # "?!" -> ☉
    FirstRule = Rule(r"\?!", r'☉')

    # "!?" -> ☈
    SecondRule = Rule(r"!\?", r'☈')

    # "??" -> ☇
    ThirdRule = Rule(r"\?\?", r'☇')

    # "!!" -> ☄  (attribute name spelled "Forth" as in the public interface)
    ForthRule = Rule(r"!!", r'☄')

    # Every rule of this group, in definition order.
    All = [
        FirstRule,
        SecondRule,
        ThirdRule,
        ForthRule,
    ]
class EscapeRegexReservedCharacters:
    """Rules that prefix ``(``, ``)``, ``[``, ``]`` and ``-`` with a backslash.

    NOTE(review): the first argument of each rule is the bare character. If
    ``Rule`` compiles it as a regular expression, a lone ``(``, ``)`` or ``[``
    is not a valid pattern -- confirm that these are applied literally.
    """

    # Parentheses.
    LeftParen = Rule('(', r'\(')
    RightParen = Rule(')', r'\)')

    # Square brackets.
    LeftBracket = Rule('[', r'\[')
    RightBracket = Rule(']', r'\]')

    # Hyphen / dash.
    Dash = Rule('-', r'\-')

    # Every rule of this group, in definition order.
    All = [
        LeftParen,
        RightParen,
        LeftBracket,
        RightBracket,
        Dash,
    ]
class SingleLetterAbbreviationRules:
    """Rules replacing the period after a single capital letter with ``∯``.

    Both patterns require the period to be followed by whitespace, optionally
    with a comma in between.
    """

    # The capital letter is the very first character (anchored with ^).
    # Rubular: http://rubular.com/r/e3H6kwnr6H
    SingleUpperCaseLetterAtStartOfLineRule = Rule(
        r"(?<=^[A-Z])\.(?=,?\s)",
        '∯',
    )

    # The capital letter is preceded by a whitespace character.
    # Rubular: http://rubular.com/r/gitvf0YWH4
    SingleUpperCaseLetterRule = Rule(
        r"(?<=\s[A-Z])\.(?=,?\s)",
        '∯',
    )

    # Every rule of this group, in definition order.
    All = [
        SingleUpperCaseLetterAtStartOfLineRule,
        SingleUpperCaseLetterRule,
    ]
class HTML:
    """Rules that delete HTML tags (replacement is the empty string).

    The patterns previously began with a stray ``/`` -- a regex-literal
    delimiter, presumably carried over from the Ruby expressions behind the
    Rubular links. In a Python pattern that slash is a literal character, so
    a tag only matched when a ``/`` happened to precede it. The delimiter has
    been removed.
    """

    # Opening, closing or self-closing tag with optional attributes whose
    # values may be double-quoted, single-quoted or unquoted.
    # Rubular: http://rubular.com/r/9d0OVOEJWj
    HTMLTagRule = Rule(
        r"""<\/?\w+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[\^'">\s]+))?)+\s*|\s*)\/?>""",
        '')

    # Entity-escaped tag, i.e. "&lt;" ... "gt;".
    # NOTE(review): the pattern started with a raw "<" while still ending in
    # "gt;", which looks like "&lt;" was entity-decoded somewhere along the
    # way; restored to "&lt;" -- confirm against the Rubular example.
    # Rubular: http://rubular.com/r/XZVqMPJhea
    EscapedHTMLTagRule = Rule(r"&lt;\/?[^gt;]*gt;", '')

    # Every rule of this group, in definition order.
    All = [HTMLTagRule, EscapedHTMLTagRule]
class ExclamationPointRules:
    """Rules replacing certain ``!`` characters with the ``&ᓴ&`` placeholder.

    All three patterns match a single ``!`` and differ only in the lookahead
    describing what must follow it.
    """

    # "!" immediately followed by a single or double quote.
    # Rubular: http://rubular.com/r/XS1XXFRfM2
    InQuotationRule = Rule(
        r"\!(?=(\'|\"))",
        r'&ᓴ&',
    )

    # "!" followed by a comma, one whitespace character and a lower-case letter.
    # Rubular: http://rubular.com/r/sl57YI8LkA
    BeforeCommaMidSentenceRule = Rule(
        r"\!(?=\,\s[a-z])",
        r'&ᓴ&',
    )

    # "!" followed by one whitespace character and a lower-case letter.
    # Rubular: http://rubular.com/r/f9zTjmkIPb
    MidSentenceRule = Rule(
        r"\!(?=\s[a-z])",
        r'&ᓴ&',
    )

    # Every rule of this group, in definition order.
    All = [
        InQuotationRule,
        BeforeCommaMidSentenceRule,
        MidSentenceRule,
    ]
class ReinsertEllipsisRules:
    """Rules turning ellipsis/period placeholder symbols back into periods."""

    # ƪ -> "..."
    SubThreeConsecutivePeriod = Rule(r"ƪ", r'...')

    # ♟ -> " . . . " (with the surrounding spaces)
    SubThreeSpacePeriod = Rule(r"♟", r' . . . ')

    # ♝ -> ". . . ."
    SubFourSpacePeriod = Rule(r"♝", r'. . . .')

    # ☏ -> ".."
    SubTwoConsecutivePeriod = Rule(r"☏", r'..')

    # ∮ -> "."
    SubOnePeriod = Rule(r"∮", r'.')

    # Every rule of this group, in definition order.
    All = [
        SubThreeConsecutivePeriod,
        SubThreeSpacePeriod,
        SubFourSpacePeriod,
        SubTwoConsecutivePeriod,
        SubOnePeriod,
    ]
class SubEscapedRegexReservedCharacters:
    """Rules mapping ``(``, ``)``, ``[``, ``]`` and ``-`` to themselves.

    Each pattern is the backslash-escaped regex form of the character, and
    the replacement is the plain character.
    """

    # Parentheses.
    SubLeftParen = Rule(r'\(', '(')
    SubRightParen = Rule(r'\)', ')')

    # Square brackets.
    SubLeftBracket = Rule(r'\[', '[')
    SubRightBracket = Rule(r'\]', ']')

    # Hyphen / dash.
    SubDash = Rule(r'\-', '-')

    # Every rule of this group, in definition order.
    All = [
        SubLeftParen,
        SubRightParen,
        SubLeftBracket,
        SubRightBracket,
        SubDash,
    ]
class AmPmRules:
    """Rules turning the ``∯`` after an A∯M / P∯M marker back into a period.

    Each pattern matches a ``∯`` that directly follows the marker and is
    itself followed by one whitespace character and an upper-case letter.
    """

    # Upper-case "P∯M".
    # Rubular: http://rubular.com/r/Vnx3m4Spc8
    UpperCasePmRule = Rule(r"(?<=P∯M)∯(?=\s[A-Z])", '.')

    # Upper-case "A∯M".
    # Rubular: http://rubular.com/r/AJMCotJVbW
    UpperCaseAmRule = Rule(r"(?<=A∯M)∯(?=\s[A-Z])", '.')

    # Lower-case "p∯m".
    # Rubular: http://rubular.com/r/13q7SnOhgA
    LowerCasePmRule = Rule(r"(?<=p∯m)∯(?=\s[A-Z])", '.')

    # Lower-case "a∯m".
    # Rubular: http://rubular.com/r/DgUDq4mLz5
    LowerCaseAmRule = Rule(r"(?<=a∯m)∯(?=\s[A-Z])", '.')

    # Every rule of this group, in definition order.
    All = [
        UpperCasePmRule,
        UpperCaseAmRule,
        LowerCasePmRule,
        LowerCaseAmRule,
    ]
class EllipsisRules:
    """Rules replacing the various ellipsis spellings with placeholders."""

    # "..." followed by whitespace and an upper-case letter -> "☏."
    # Rubular: http://rubular.com/r/i60hCK81fz
    ThreeConsecutiveRule = Rule(
        r"\.\.\.(?=\s+[A-Z])",
        '☏.',
    )

    # First three of four periods (after a non-space character, before
    # ". " + upper-case letter) -> "ƪ"
    # Rubular: http://rubular.com/r/Hdqpd90owl
    FourConsecutiveRule = Rule(
        r"(?<=\S)\.{3}(?=\.\s[A-Z])",
        'ƪ',
    )

    # " . . . " (three space-separated periods plus trailing space) -> "♟"
    # Rubular: http://rubular.com/r/YBG1dIHTRu
    ThreeSpaceRule = Rule(
        r"(\s\.){3}\s",
        '♟',
    )

    # ". . . ." directly after a lower-case letter -> "♝"
    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FourSpaceRule = Rule(
        r"(?<=[a-z])(\.\s){3}\.(|$|\n)",
        '♝',
    )

    # Any "..." -> "ƪ"
    OtherThreePeriodRule = Rule(
        r"\.\.\.",
        'ƪ',
    )

    # NOTE: this list is deliberately NOT in definition order; the sequence
    # below is the one given by the original code.
    All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule,
    ]
class Numbers:
    """Rules replacing periods that sit next to digits with ``∯``."""

    # Period directly followed by a digit.
    # Rubular: http://rubular.com/r/oNyxBOqbyy
    PeriodBeforeNumberRule = Rule(
        r"\.(?=\d)",
        '∯',
    )

    # Period between a digit and any non-whitespace character.
    # Rubular: http://rubular.com/r/EMk5MpiUzt
    NumberAfterPeriodBeforeLetterRule = Rule(
        r"(?<=\d)\.(?=\S)",
        '∯',
    )

    # Period after "\r" + one digit, followed by either whitespace plus a
    # non-whitespace character, or a closing parenthesis.
    # Rubular: http://rubular.com/r/rf4l1HjtjG
    NewLineNumberPeriodSpaceLetterRule = Rule(
        r"(?<=\r\d)\.(?=(\s\S)|\))",
        '∯',
    )

    # Same lookahead, but the single digit is anchored with ^.
    # Rubular: http://rubular.com/r/HPa4sdc6b9
    StartLineNumberPeriodRule = Rule(
        r"(?<=^\d)\.(?=(\s\S)|\))",
        '∯',
    )

    # Same lookahead, with two digits anchored with ^.
    # Rubular: http://rubular.com/r/NuvWnKleFl
    StartLineTwoDigitNumberPeriodRule = Rule(
        r"(?<=^\d\d)\.(?=(\s\S)|\))",
        '∯',
    )

    # Every rule of this group, in definition order.
    All = [
        PeriodBeforeNumberRule,
        NumberAfterPeriodBeforeLetterRule,
        NewLineNumberPeriodSpaceLetterRule,
        StartLineNumberPeriodRule,
        StartLineTwoDigitNumberPeriodRule,
    ]
class SubSymbolsRules:
    """Rules turning placeholder symbols back into punctuation characters.

    Each rule is ``Rule(placeholder_pattern, replacement)``.

    Fix: ``FullWidthExclamation`` and ``FullWidthQuestionMark`` previously
    restored the ASCII characters ``!`` and ``?`` -- identical to what
    ``ExclamationPoint`` and ``QuestionMark`` restore -- although they have
    their own placeholders and the sibling ``FullWidthPeriod`` restores the
    full-width ``。``. They now restore U+FF01 and U+FF1F so that full-width
    punctuation survives the round trip.
    NOTE(review): the forward rules that create these placeholders are not
    visible here -- confirm they are produced from the full-width characters.
    """

    Period = Rule(r'∯', '.')
    ArabicComma = Rule(r"♬", r'،')
    # NOTE(review): named "SemiColon" but the replacement is a colon; kept
    # unchanged because the intended character cannot be determined here.
    SemiColon = Rule(r"♭", r':')
    FullWidthPeriod = Rule(r"&ᓰ&", r'。')
    SpecialPeriod = Rule(r"&ᓱ&", r'.')
    # Written as escapes so the full-width characters cannot be silently
    # normalised to their ASCII look-alikes.
    FullWidthExclamation = Rule(r"&ᓳ&", '\uFF01')
    ExclamationPoint = Rule(r"&ᓴ&", r'!')
    QuestionMark = Rule(r"&ᓷ&", r'?')
    FullWidthQuestionMark = Rule(r"&ᓸ&", '\uFF1F')

    # Placeholders for the two-character ?/! pairs.
    MixedDoubleQE = Rule(r"☉", r'?!')
    MixedDoubleQQ = Rule(r"☇", r'??')
    MixedDoubleEQ = Rule(r"☈", r'!?')
    MixedDoubleEE = Rule(r"☄", r'!!')

    LeftParens = Rule(r"&✂&", r'(')
    RightParens = Rule(r"&⌬&", r')')
    # The marker is simply removed (attribute name spelled as in the public
    # interface).
    TemporaryEndingPunctutation = Rule('ȸ', '')
    # The replacement is the two characters backslash + "n" (raw string).
    Newline = Rule(r"ȹ", r"\n")

    # Every rule of this group, in definition order.
    All = [
        Period, ArabicComma, SemiColon, FullWidthPeriod, SpecialPeriod,
        FullWidthExclamation, ExclamationPoint, QuestionMark,
        FullWidthQuestionMark, MixedDoubleQE, MixedDoubleQQ, MixedDoubleEQ,
        MixedDoubleEE, LeftParens, RightParens, TemporaryEndingPunctutation,
        Newline,
    ]
class Abbreviations:
    """Holds the rule for periods that sit between two word characters."""

    # A period with a word character on each side becomes "∮"; the two
    # neighbouring characters are written back through groups 1 and 3.
    # Rubular: http://rubular.com/r/EUbZCNfgei
    WithMultiplePeriodsAndEmailRule = Rule(
        r"(\w)(\.)(\w)",
        r'\1∮\3',
    )
# Lower-case abbreviation strings, stored without their trailing period.
PREPOSITIVE_ABBREVIATIONS = {
    'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr',
    'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt', 'messrs',
    'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen', 'sens', 'sgt', 'st',
    'supt', 'v', 'vs',
}

# Lower-case abbreviation strings, stored without their trailing period.
NUMBER_ABBREVIATIONS = {'art', 'ext', 'no', 'nos', 'p', 'pp'}


class Abbreviations:
    """Period-protection rules for e-mails, coordinates and file names.

    NOTE(review): the original layout of this section was lost; the four
    rules are kept together as attributes of this class -- confirm that
    this matches how callers reference them.
    """

    # A period with a word character on each side becomes "∮"; the two
    # neighbouring characters are written back through groups 1 and 3.
    # Rubular: http://rubular.com/r/EUbZCNfgei
    WithMultiplePeriodsAndEmailRule = Rule(r"(\w)(\.)(\w)", r'\1∮\3')

    # Period after "<letter>°" and before optional whitespace plus digits.
    # Fix: the character class used to be [a-zA-z]; the range A-z also
    # covers "[", "\", "]", "^", "_" and "`", so it is now [a-zA-Z].
    # Rubular: http://rubular.com/r/G2opjedIm9
    GeoLocationRule = Rule(r"(?<=[a-zA-Z]°)\.(?=\s*\d+)", r'∯')

    # Period preceded by whitespace and followed by one of the listed file
    # extensions plus whitespace.
    FileFormatRule = Rule(
        r"(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt"
        r"|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb"
        r"|cpp|cs|js)\s)",
        r'∯')

    # Newline -> "ȹ" placeholder.
    SingleNewLineRule = Rule(r"\n", r'ȹ')


class DoublePunctuationRules:
    """Rules that collapse a two-character ?/! pair into one placeholder."""

    # "?!" -> ☉
    FirstRule = Rule(r"\?!", r'☉')
    # "!?" -> ☈
    SecondRule = Rule(r"!\?", r'☈')
    # "??" -> ☇
    ThirdRule = Rule(r"\?\?", r'☇')
    # "!!" -> ☄  (attribute name spelled "Forth" as in the public interface)
    ForthRule = Rule(r"!!", r'☄')

    # Every rule of this group, in definition order.
    All = [FirstRule, SecondRule, ThirdRule, ForthRule]
class PDF:
    """Rules dealing with newlines that fall inside a sentence.

    The patterns previously began with a stray ``/`` -- a regex-literal
    delimiter, presumably carried over from the Ruby expressions behind the
    Rubular links. In a Python pattern that slash is a literal character, so
    the rules only matched when a ``/`` preceded the newline. The delimiter
    has been removed.
    """

    # Newline preceded by "<non-newline><whitespace>" and followed by a
    # non-whitespace character: removed.
    # Rubular: http://rubular.com/r/UZAVcwqck8
    NewLineInMiddleOfSentenceRule = Rule(r"(?<=[^\n]\s)\n(?=\S)", '')

    # Newline directly followed by a lower-case letter: replaced by a space.
    # Rubular: http://rubular.com/r/eaNwGavmdo
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(r"\n(?=[a-z])", ' ')
from pygmatic_segmenter.types import Rule

# Text-cleaning rules for newlines, escaped control sequences, inline
# formatting markers and dot leaders.
#
# Fix: every pattern below used to start with a stray "/" -- a regex-literal
# delimiter, presumably carried over from the Ruby expressions behind the
# Rubular links. In a Python pattern that slash is a literal character, so no
# rule matched unless a "/" happened to precede the text. The delimiter has
# been removed from all patterns.

# Newline followed by one or two letters and another newline: removed.
# Rubular: http://rubular.com/r/V57WnM9Zut
NewLineInMiddleOfWordRule = Rule(r"\n(?=[a-zA-Z]{1,2}\n)", '')

# Newline, space, newline -> carriage return.
# Rubular: http://rubular.com/r/dMxp5MixFS
DoubleNewLineWithSpaceRule = Rule(r"\n \n", "\r")

# Two consecutive newlines -> carriage return.
# Rubular: http://rubular.com/r/H6HOJeA8bq
DoubleNewLineRule = Rule(r"\n\n", "\r")

# Newline followed by a period and whitespace: removed.
# Rubular: http://rubular.com/r/FseyMiiYFT
NewLineFollowedByPeriodRule = Rule(r"\n(?=\.(\s|\n))", '')

# Any newline -> carriage return.
ReplaceNewlineWithCarriageReturnRule = Rule(r"\n", "\r")

# Literal backslash + "n" / backslash + "r" -> the real control character.
EscapedNewLineRule = Rule(r"\\n", "\n")
EscapedCarriageReturnRule = Rule(r"\\r", "\r")

# Same, with a space typed between the backslash and the letter.
TypoEscapedNewLineRule = Rule(r"\\\ n", "\n")
TypoEscapedCarriageReturnRule = Rule(r"\\\ r", "\r")

# "{b^>123<b^}"-style markers: removed.
# Fix: the two alternatives were byte-identical, so the first one was dead.
# NOTE(review): it is restored here to the entity-escaped spelling
# ("&gt;" / "&lt;"), which is what the duplicate appears to have been before
# being entity-decoded -- confirm against the Rubular example.
# Rubular: http://rubular.com/r/bAJrhyLNeZ
InlineFormattingRule = Rule(
    r"\{b\^&gt;\d*&lt;b\^\}|\{b\^>\d*<b\^\}", '')

# Five or more periods followed by a number such as "12" or "3-4"
# -> carriage return.
# Rubular: http://rubular.com/r/8mc1ArOIGy
TableOfContentsRule = Rule(r"\.{5,}\s*\d+-*\d*", "\r")

# Five or more periods -> a single space.
# Rubular: http://rubular.com/r/DwNSuZrNtk
ConsecutivePeriodsRule = Rule(r"\.{5,}", ' ')