class EllipsisRules(object):
    """Rules that swap ellipses for placeholder symbols.

    NOTE: these rules are not the same as the original pragmatic segmenter
    rules; the modification is that spaces are replaced with the same number
    of symbols.
    """

    # Rubular: http://rubular.com/r/i60hCK81fz
    ThreeConsecutiveRule = Rule(
        r'\.\.\.(?=\s+[A-Z])',
        '☏☏.',
    )

    # Rubular: http://rubular.com/r/Hdqpd90owl
    FourConsecutiveRule = Rule(
        r'(?<=\S)\.{3}(?=\.\s[A-Z])',
        'ƪƪƪ',
    )

    # Rubular: http://rubular.com/r/YBG1dIHTRu
    ThreeSpaceRule = Rule(
        r'(\s\.){3}\s',
        '♟♟♟♟♟♟♟',
    )

    # Rubular: http://rubular.com/r/2VvZ8wRbd8
    FourSpaceRule = Rule(
        r'(?<=[a-z])(\.\s){3}\.($|\\n)',
        '♝♝♝♝♝♝♝',
    )

    # Fallback for any three consecutive periods.
    OtherThreePeriodRule = Rule(
        r'\.\.\.',
        'ƪƪƪ',
    )

    # Spaced variants come first and the generic three-period fallback last.
    All = [
        ThreeSpaceRule,
        FourSpaceRule,
        FourConsecutiveRule,
        ThreeConsecutiveRule,
        OtherThreePeriodRule,
    ]
class Numbers(object):
    """Rules that replace periods adjacent to digits with the '∯' placeholder."""

    # Rubular: http://rubular.com/r/oNyxBOqbyy
    PeriodBeforeNumberRule = Rule(
        r'\.(?=\d)',
        '∯',
    )

    # Rubular: http://rubular.com/r/EMk5MpiUzt
    NumberAfterPeriodBeforeLetterRule = Rule(
        r'(?<=\d)\.(?=\S)',
        '∯',
    )

    # Rubular: http://rubular.com/r/rf4l1HjtjG
    NewLineNumberPeriodSpaceLetterRule = Rule(
        r'(?<=\r\d)\.(?=(\s\S)|\))',
        '∯',
    )

    # Rubular: http://rubular.com/r/HPa4sdc6b9
    StartLineNumberPeriodRule = Rule(
        r'(?<=^\d)\.(?=(\s\S)|\))',
        '∯',
    )

    # Rubular: http://rubular.com/r/NuvWnKleFl
    StartLineTwoDigitNumberPeriodRule = Rule(
        r'(?<=^\d\d)\.(?=(\s\S)|\))',
        '∯',
    )

    All = [
        PeriodBeforeNumberRule,
        NumberAfterPeriodBeforeLetterRule,
        NewLineNumberPeriodSpaceLetterRule,
        StartLineNumberPeriodRule,
        StartLineTwoDigitNumberPeriodRule,
    ]
class Persian(Common, Standard):
    """Persian ('fa') punctuation, boundary regex and replacement rules."""

    iso_code = 'fa'

    Punctuations = ['?', '!', ':', '.', '؟']
    SENTENCE_BOUNDARY_REGEX = r'.*?[:\.!\?؟]|.*?\Z|.*?$'

    # Rubular: http://rubular.com/r/RX5HpdDIyv
    ReplaceColonBetweenNumbersRule = Rule(r'(?<=\d):(?=\d)', '♭')

    # Rubular: http://rubular.com/r/kPRgApNHUg
    ReplaceNonSentenceBoundaryCommaRule = Rule(r'،(?=\s\S+،)', '♬')

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer with no sentence starters configured."""

        SENTENCE_STARTERS = []

        def __init__(self, text, lang):
            super().__init__(text, lang)

        def scan_for_replacements(self, txt, am, index, character_array):
            """Replace the period that directly follows `am` with '∯'.

            Args:
                txt: text to rewrite.
                am: the abbreviation text; it is matched literally.
                index: unused here; kept for signature compatibility.
                character_array: unused here; kept for signature
                    compatibility.

            Returns:
                `txt` with every '.' immediately preceded by `am` replaced
                by '∯'.
            """
            # Raw string: '\.' in a plain string literal is an invalid escape
            # sequence. re.escape: `am` is data, not a pattern, so characters
            # such as '.' or '(' must not be interpreted as regex syntax.
            pattern = r'(?<={0})\.'.format(re.escape(am))
            return re.sub(pattern, '∯', txt)
class Abbreviation(object):
    """Defines the abbreviations for each language (if available)"""

    # Known abbreviations, written without their trailing period.
    ABBREVIATIONS = (
        "adj adm adv al ala alta apr arc ariz ark art assn asst attys aug ave "
        "bart bld bldg blvd brig bros btw cal calif capt cl cmdr co col colo "
        "comdr con conn corp cpl cres ct d.phil dak dec del dept det dist dr "
        "dr.phil dr.philos drs e.g ens esp esq etc exp expy ext feb fed fla "
        "ft fwy fy ga gen gov hon hosp hr hway hwy i.e ia id ida ill inc ind "
        "ing insp is jan jr jul jun kan kans ken ky la lt ltd maj man mar "
        "mass may md me med messrs mex mfg mich min minn miss mlle mm mme mo "
        "mont mr mrs ms msgr mssrs mt mtn neb nebr nev no nos nov nr oct ok "
        "okla ont op ord ore p pa pd pde penn penna pfc ph ph.d pl plz pp "
        "prof pvt que rd rs ref rep reps res rev rt sask sec sen sens sep "
        "sept sfc sgt sr st supt surg tce tenn tex univ usafa u.s ut va v ver "
        "viz vs vt wash wis wisc wy wyo yuk fig"
    ).split()

    PREPOSITIVE_ABBREVIATIONS = (
        "adm attys brig capt cmdr col cpl det dr gen gov ing lt maj mr mrs ms "
        "mt messrs mssrs prof ph rep reps rev sen sens sgt st supt v vs fig"
    ).split()

    NUMBER_ABBREVIATIONS = "art ext no nos p pp".split()

    # Rubular: http://rubular.com/r/EUbZCNfgei
    # The pattern was previously r'(\w)(\.)(\w)'. \w in python matches
    # unicode abbreviations also, so it is limited to english alphanumerics.
    # NOTE(review): the source had lost its indentation; this rule is assumed
    # to belong to Abbreviation -- confirm against the callers.
    WithMultiplePeriodsAndEmailRule = Rule(
        r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])',
        '\\1∮\\3',
    )
class Standard(object):
    """Punctuation marks and generic replacement rules."""

    # This class holds the punctuation marks. Fullwidth forms ('．', '！',
    # '？') are listed next to their ASCII counterparts; the list previously
    # contained the ASCII characters twice instead.
    Punctuations = ['。', '．', '.', '！', '!', '?', '？']

    # Rubular: http://rubular.com/r/G2opjedIm9
    # The class is [a-zA-Z]: the former 'A-z' range also covered the ASCII
    # characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # A period that follows whitespace and precedes a known file extension.
    FileFormatRule = Rule(
        r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)',
        '∯')

    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # Rubular: http://rubular.com/r/aXPUGm6fQh
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Three or more whitespace characters collapse to a single space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    SubSingleQuoteRule = Rule(r'&⎋&', "'")
class SubSymbolsRules(object):
    """Rules that turn placeholder symbols back into the original characters."""

    Period = Rule(r'∯', '.')
    ArabicComma = Rule(r'♬', '،')
    SemiColon = Rule(r'♭', ':')
    FullWidthPeriod = Rule(r'&ᓰ&', '。')
    # The three fullwidth rules below must emit fullwidth characters
    # (U+FF0E, U+FF01, U+FF1F). They previously emitted the ASCII forms,
    # which made them indistinguishable from Period / ExclamationPoint /
    # QuestionMark and altered fullwidth punctuation in the output.
    SpecialPeriod = Rule(r'&ᓱ&', '．')
    FullWidthExclamation = Rule(r'&ᓳ&', '！')
    ExclamationPoint = Rule(r'&ᓴ&', '!')
    QuestionMark = Rule(r'&ᓷ&', '?')
    FullWidthQuestionMark = Rule(r'&ᓸ&', '？')
    MixedDoubleQE = Rule(r'☉', '?!')
    MixedDoubleQQ = Rule(r'☇', '??')
    MixedDoubleEQ = Rule(r'☈', '!?')
    MixedDoubleEE = Rule(r'☄', '!!')
    LeftParens = Rule(r'&✂&', '(')
    RightParens = Rule(r'&⌬&', ')')
    # The temporary end-of-text marker is removed.
    TemporaryEndingPunctutation = Rule(r'ȸ', '')
    Newline = Rule(r'ȹ', "\n")

    All = [
        Period,
        ArabicComma,
        SemiColon,
        FullWidthPeriod,
        SpecialPeriod,
        FullWidthExclamation,
        ExclamationPoint,
        QuestionMark,
        FullWidthQuestionMark,
        MixedDoubleQE,
        MixedDoubleQQ,
        MixedDoubleEQ,
        MixedDoubleEE,
        LeftParens,
        RightParens,
        TemporaryEndingPunctutation,
        Newline,
    ]
class Standard:
    """Punctuation marks, generic rules and the nested rule groups."""

    # This class holds the punctuation marks. Fullwidth forms ('．', '！',
    # '？') are listed next to their ASCII counterparts; the list previously
    # contained the ASCII characters twice instead.
    Punctuations = ['。', '．', '.', '！', '!', '?', '？']

    # Rubular: http://rubular.com/r/G2opjedIm9
    # The class is [a-zA-Z]: the former 'A-z' range also covered the ASCII
    # characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
    GeoLocationRule = Rule(r'(?<=[a-zA-Z]°)\.(?=\s*\d+)', '∯')

    # A period that follows whitespace and precedes a known file extension.
    FileFormatRule = Rule(
        r'(?<=\s)\.(?=(jpe?g|png|gif|tiff?|pdf|ps|docx?|xlsx?|svg|bmp|tga|exif|odt|html?|txt|rtf|bat|sxw|xml|zip|exe|msi|blend|wmv|mp[34]|pptx?|flac|rb|cpp|cs|js)\s)',
        '∯')

    SingleNewLineRule = Rule(r'\n', 'ȹ')

    # Rubular: http://rubular.com/r/aXPUGm6fQh
    QuestionMarkInQuotationRule = Rule(r'\?(?=(\'|\"))', '&ᓷ&')

    # Three or more whitespace characters collapse to a single space.
    ExtraWhiteSpaceRule = Rule(r'\s{3,}', ' ')

    SubSingleQuoteRule = Rule(r'&⎋&', "'")

    class Abbreviation(object):
        """Defines the abbreviations for each language (if available)"""

        ABBREVIATIONS = [
            'adj', 'adm', 'adv', 'al', 'ala', 'alta', 'apr', 'arc', 'ariz',
            'ark', 'art', 'assn', 'asst', 'attys', 'aug', 'ave', 'bart',
            'bld', 'bldg', 'blvd', 'brig', 'bros', 'btw', 'cal', 'calif',
            'capt', 'cl', 'cmdr', 'co', 'col', 'colo', 'comdr', 'con', 'conn',
            'corp', 'cpl', 'cres', 'ct', 'd.phil', 'dak', 'dec', 'del',
            'dept', 'det', 'dist', 'dr', 'dr.phil', 'dr.philos', 'drs', 'e.g',
            'ens', 'esp', 'esq', 'etc', 'exp', 'expy', 'ext', 'feb', 'fed',
            'fla', 'ft', 'fwy', 'fy', 'ga', 'gen', 'gov', 'hon', 'hosp', 'hr',
            'hway', 'hwy', 'i.e', 'ia', 'id', 'ida', 'ill', 'inc', 'ind',
            'ing', 'insp', 'is', 'jan', 'jr', 'jul', 'jun', 'kan', 'kans',
            'ken', 'ky', 'la', 'lt', 'ltd', 'maj', 'man', 'mar', 'mass',
            'may', 'md', 'me', 'med', 'messrs', 'mex', 'mfg', 'mich', 'min',
            'minn', 'miss', 'mlle', 'mm', 'mme', 'mo', 'mont', 'mr', 'mrs',
            'ms', 'msgr', 'mssrs', 'mt', 'mtn', 'neb', 'nebr', 'nev', 'no',
            'nos', 'nov', 'nr', 'oct', 'ok', 'okla', 'ont', 'op', 'ord',
            'ore', 'p', 'pa', 'pd', 'pde', 'penn', 'penna', 'pfc', 'ph',
            'ph.d', 'pl', 'plz', 'pp', 'prof', 'pvt', 'que', 'rd', 'rs',
            'ref', 'rep', 'reps', 'res', 'rev', 'rt', 'sask', 'sec', 'sen',
            'sens', 'sep', 'sept', 'sfc', 'sgt', 'sr', 'st', 'supt', 'surg',
            'tce', 'tenn', 'tex', 'univ', 'usafa', 'u.s', 'ut', 'va', 'v',
            'ver', 'viz', 'vs', 'vt', 'wash', 'wis', 'wisc', 'wy', 'wyo',
            'yuk', 'fig',
        ]

        PREPOSITIVE_ABBREVIATIONS = [
            'adm', 'attys', 'brig', 'capt', 'cmdr', 'col', 'cpl', 'det', 'dr',
            'gen', 'gov', 'ing', 'lt', 'maj', 'mr', 'mrs', 'ms', 'mt',
            'messrs', 'mssrs', 'prof', 'ph', 'rep', 'reps', 'rev', 'sen',
            'sens', 'sgt', 'st', 'supt', 'v', 'vs', 'fig',
        ]

        NUMBER_ABBREVIATIONS = ['art', 'ext', 'no', 'nos', 'p', 'pp']

        # Part of "Abbreviations" ruby module
        # Rubular: http://rubular.com/r/EUbZCNfgei
        # Limited to ASCII alphanumerics: \w in Python also matches non-ASCII
        # word characters, so r'(\w)(\.)(\w)' rewrote periods between them too.
        # NOTE(review): the source had lost its indentation; this rule is
        # assumed to belong to Abbreviation -- confirm against the callers.
        WithMultiplePeriodsAndEmailRule = Rule(
            r'([a-zA-Z0-9_])(\.)([a-zA-Z0-9_])', '\\1∮\\3')

    class DoublePunctuationRules(object):
        """Rules that replace two-character '?'/'!' pairs with one symbol."""

        FirstRule = Rule(r'\?!', '☉')
        SecondRule = Rule(r'!\?', '☈')
        ThirdRule = Rule(r'\?\?', '☇')
        ForthRule = Rule(r'!!', '☄')
        DoublePunctuation = r'\?!|!\?|\?\?|!!'
        All = [FirstRule, SecondRule, ThirdRule, ForthRule]

    class ExclamationPointRules(object):
        """Rules that replace selected exclamation points with '&ᓴ&'."""

        # Rubular: http://rubular.com/r/XS1XXFRfM2
        InQuotationRule = Rule(r'\!(?=(\'|\"))', '&ᓴ&')

        # Rubular: http://rubular.com/r/sl57YI8LkA
        BeforeCommaMidSentenceRule = Rule(r'\!(?=\,\s[a-z])', '&ᓴ&')

        # Rubular: http://rubular.com/r/f9zTjmkIPb
        MidSentenceRule = Rule(r'\!(?=\s[a-z])', '&ᓴ&')

        All = [InQuotationRule, BeforeCommaMidSentenceRule, MidSentenceRule]

    class SubSymbolsRules(object):
        """Rules that turn placeholder symbols back into characters."""

        Period = Rule(r'∯', '.')
        ArabicComma = Rule(r'♬', '،')
        SemiColon = Rule(r'♭', ':')
        FullWidthPeriod = Rule(r'&ᓰ&', '。')
        # The three fullwidth rules below must emit fullwidth characters
        # (U+FF0E, U+FF01, U+FF1F); they previously emitted the ASCII forms.
        SpecialPeriod = Rule(r'&ᓱ&', '．')
        FullWidthExclamation = Rule(r'&ᓳ&', '！')
        ExclamationPoint = Rule(r'&ᓴ&', '!')
        QuestionMark = Rule(r'&ᓷ&', '?')
        FullWidthQuestionMark = Rule(r'&ᓸ&', '？')
        MixedDoubleQE = Rule(r'☉', '?!')
        MixedDoubleQQ = Rule(r'☇', '??')
        MixedDoubleEQ = Rule(r'☈', '!?')
        MixedDoubleEE = Rule(r'☄', '!!')
        LeftParens = Rule(r'&✂&', '(')
        RightParens = Rule(r'&⌬&', ')')
        TemporaryEndingPunctutation = Rule(r'ȸ', '')
        Newline = Rule(r'ȹ', "\n")

        All = [Period, ArabicComma, SemiColon, FullWidthPeriod,
               SpecialPeriod, FullWidthExclamation, ExclamationPoint,
               QuestionMark, FullWidthQuestionMark, MixedDoubleQE,
               MixedDoubleQQ, MixedDoubleEQ, MixedDoubleEE, LeftParens,
               RightParens, TemporaryEndingPunctutation, Newline]

    class EllipsisRules(object):
        """Rules that swap ellipses for placeholder symbols."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: spaces replaced with same number of symbols

        # Rubular: http://rubular.com/r/i60hCK81fz
        ThreeConsecutiveRule = Rule(r'\.\.\.(?=\s+[A-Z])', '☏☏.')

        # Rubular: http://rubular.com/r/Hdqpd90owl
        FourConsecutiveRule = Rule(r'(?<=\S)\.{3}(?=\.\s[A-Z])', 'ƪƪƪ')

        # Rubular: http://rubular.com/r/YBG1dIHTRu
        ThreeSpaceRule = Rule(r'(\s\.){3}\s', '♟♟♟♟♟♟♟')

        # Rubular: http://rubular.com/r/2VvZ8wRbd8
        FourSpaceRule = Rule(r'(?<=[a-z])(\.\s){3}\.($|\\n)', '♝♝♝♝♝♝♝')

        OtherThreePeriodRule = Rule(r'\.\.\.', 'ƪƪƪ')

        All = [ThreeSpaceRule, FourSpaceRule, FourConsecutiveRule,
               ThreeConsecutiveRule, OtherThreePeriodRule]

    class ReinsertEllipsisRules(object):
        """Rules that turn the ellipsis placeholders back into periods."""

        # below rules aren't similar to original rules of pragmatic segmenter
        # modification: symbols replaced with same number of ellipses
        SubThreeConsecutivePeriod = Rule(r'ƪƪƪ', '...')
        SubThreeSpacePeriod = Rule(r'♟♟♟♟♟♟♟', ' . . . ')
        SubFourSpacePeriod = Rule(r'♝♝♝♝♝♝♝', '. . . .')
        SubTwoConsecutivePeriod = Rule(r'☏☏', '..')
        SubOnePeriod = Rule(r'∮', '.')

        All = [SubThreeConsecutivePeriod, SubThreeSpacePeriod,
               SubFourSpacePeriod, SubTwoConsecutivePeriod, SubOnePeriod]

    class AbbreviationReplacer(AbbreviationReplacer):
        """Abbreviation replacer configured with the sentence-starter words."""

        SENTENCE_STARTERS = "A Being Did For He How However I In It Millions "\
            "More She That The There They We What When Where Who Why".split(" ")
class PDF(object):
    """Newline clean-up rules for PDF-derived text."""

    # Rubular: http://rubular.com/r/UZAVcwqck8
    # A newline after "<non-newline><whitespace>" and before non-whitespace
    # is removed.
    NewLineInMiddleOfSentenceRule = Rule(
        r'(?<=[^\n]\s)\n(?=\S)',
        '',
    )

    # Rubular: http://rubular.com/r/eaNwGavmdo
    # A newline directly before a lowercase letter becomes a single space.
    NewLineInMiddleOfSentenceNoSpacesRule = Rule(
        r"\n(?=[a-z])",
        ' ',
    )
class CleanRules(object):
    """Regex rules and constants for cleaning raw text."""

    # NOTE: Caution: Might require \\ for special characters
    # if regex is defined with r'' then dont
    # add extra \\ for special characters

    # Rubular: http://rubular.com/r/V57WnM9Zut
    NewLineInMiddleOfWordRule = Rule(r'\n(?=[a-zA-Z]{1,2}\n)', '')

    # Rubular: http://rubular.com/r/dMxp5MixFS
    DoubleNewLineWithSpaceRule = Rule(r'\n \n', "\r")

    # Rubular: http://rubular.com/r/H6HOJeA8bq
    DoubleNewLineRule = Rule(r'\n\n', "\r")

    # Rubular: http://rubular.com/r/FseyMiiYFT
    NewLineFollowedByPeriodRule = Rule(r'\n(?=\.(\s|\n))', '')

    ReplaceNewlineWithCarriageReturnRule = Rule(r'\n', "\r")

    # Literal backslash sequences (the two characters '\' + 'n' / 'r') are
    # turned into real control characters; the "Typo" variants match the
    # same sequences with a space after the backslash.
    EscapedNewLineRule = Rule(r'\\n', "\n")
    EscapedCarriageReturnRule = Rule(r'\\r', "\r")
    TypoEscapedNewLineRule = Rule(r'\\\ n', "\n")
    TypoEscapedCarriageReturnRule = Rule(r'\\\ r', "\r")

    # Rubular: http://rubular.com/r/bAJrhyLNeZ
    # The pattern used to list this same alternative twice ('X|X'); the
    # duplicate was dead and has been dropped without changing what matches.
    # NOTE(review): the lost alternative presumably targeted an HTML-escaped
    # form of the marker ('&gt;' / '&lt;') -- confirm and restore if needed.
    InlineFormattingRule = Rule(r'{b\^>\d*<b\^}', '')

    # Rubular: http://rubular.com/r/8mc1ArOIGy
    TableOfContentsRule = Rule(r'\.{4,}\s*\d+-*\d*', "\r")

    # Rubular: http://rubular.com/r/DwNSuZrNtk
    ConsecutivePeriodsRule = Rule(r'\.{5,}', ' ')

    # Rubular: http://rubular.com/r/IQ4TPfsbd8
    ConsecutiveForwardSlashRule = Rule(r'\/{3}', '')

    # Rubular: http://rubular.com/r/6dt98uI76u
    NO_SPACE_BETWEEN_SENTENCES_REGEX = r'(?<=[a-z])\.(?=[A-Z])'
    NoSpaceBetweenSentencesRule = Rule(NO_SPACE_BETWEEN_SENTENCES_REGEX, '. ')

    # Rubular: http://rubular.com/r/l6KN6rH5XE
    NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX = r'(?<=\d)\.(?=[A-Z])'
    NoSpaceBetweenSentencesDigitRule = Rule(
        NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX, '. ')

    URL_EMAIL_KEYWORDS = ['@', 'http', '.com', 'net', 'www', '//']

    # Rubular: http://rubular.com/r/3GiRiP2IbD
    NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEX = r'(?<=\s)\n(?=([a-z]|\())'

    # Rubular: http://rubular.com/r/Gn18aAnLdZ
    # The lookahead is just the bullet. A stray "'" after '•' previously made
    # the rule match only a newline followed by the two characters "•'".
    NewLineFollowedByBulletRule = Rule(r"\n(?=•)", "\r")

    QuotationsFirstRule = Rule(r"''", '"')
    QuotationsSecondRule = Rule(r'``', '"')
class Common(object):
    """Regexes and rule groups shared across languages."""

    # added special case: r"[。．.！!?？ ]{2,}" to handle intermittent dots,
    # exclamation, etc.
    # r"[。．.！!?？]" at end to handle single instances of these symbol inputs
    #
    # The fullwidth characters （ ） ． ！ ？ are required here. With them
    # folded to ASCII, the first alternative became an ordinary capture group
    # matching any run of non-')' characters before a capital letter, and the
    # character classes only contained duplicates of '.', '!' and '?'.
    SENTENCE_BOUNDARY_REGEX = (
        r"（(?:[^）])*）(?=\s?[A-Z])"
        r"|「(?:[^」])*」(?=\s[A-Z])"
        r"|\((?:[^\)]){2,}\)(?=\s[A-Z])"
        r"|\'(?:[^\'])*[^,]\'(?=\s[A-Z])"
        r"|\"(?:[^\"])*[^,]\"(?=\s[A-Z])"
        r"|\“(?:[^\”])*[^,]\”(?=\s[A-Z])"
        r"|[。．.！!?？ ]{2,}"
        r"|\S.*?[。．.！!?？ȸȹ☉☈☇☄]"
        r"|[。．.！!?？]"
    )

    # Rubular: http://rubular.com/r/NqCqv372Ix
    QUOTATION_AT_END_OF_SENTENCE_REGEX = r'[!?\.-][\"\'“”]\s{1}[A-Z]'

    # Rubular: http://rubular.com/r/6flGnUMEVl
    PARENS_BETWEEN_DOUBLE_QUOTES_REGEX = r'["\”]\s\(.*\)\s["\“]'

    # Rubular: http://rubular.com/r/TYzr4qOW1Q
    # BETWEEN_DOUBLE_QUOTES_REGEX = / "(?:[^"])*[^, ]"|“(?: [ ^”])*[^, ]”/

    # Rubular: http://rubular.com/r/JMjlZHAT4g
    SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX = r'(?<=[!?\.-][\"\'“”])\s{1}(?=[A-Z])'

    # Rubular: http://rubular.com/r/mQ8Es9bxtk
    CONTINUOUS_PUNCTUATION_REGEX = r'(?<=\S)(!|\?){3,}(?=(\s|\Z|$))'

    # https://rubular.com/r/UkumQaILKbkeyc
    # https://github.com/diasks2/pragmatic_segmenter/commit/d9ec1a352aff92b91e2e572c30bb9561eb42c703
    NUMBERED_REFERENCE_REGEX = r'(?<=[^\d\s])(\.|∯)((\[(\d{1,3},?\s?-?\s?)*\b\d{1,3}\])+|((\d{1,3}\s?)?\d{1,3}))(\s)(?=[A-Z])'

    # Rubular: http://rubular.com/r/yqa4Rit8EY
    PossessiveAbbreviationRule = Rule(r"\.(?='s\s)|\.(?='s$)|\.(?='s\Z)", '∯')

    # Rubular: http://rubular.com/r/NEv265G2X2
    KommanditgesellschaftRule = Rule(r'(?<=Co)\.(?=\sKG)', '∯')

    # Rubular: http://rubular.com/r/xDkpFZ0EgH
    MULTI_PERIOD_ABBREVIATION_REGEX = r"\b[a-z](?:\.[a-z])+[.]"

    class SingleLetterAbbreviationRules(object):
        """Searches for periods within an abbreviation and replaces the periods.
        """

        # Rubular: http://rubular.com/r/e3H6kwnr6H
        SingleUpperCaseLetterAtStartOfLineRule = Rule(r"(?<=^[A-Z])\.(?=\s)", '∯')

        # Rubular: http://rubular.com/r/gitvf0YWH4
        SingleUpperCaseLetterRule = Rule(r"(?<=\s[A-Z])\.(?=,?\s)", '∯')

        All = [
            SingleUpperCaseLetterAtStartOfLineRule, SingleUpperCaseLetterRule
        ]

    class AmPmRules(object):
        """Rules that restore the final period of a masked A.M./P.M.

        Each rule turns the trailing '∯' back into '.' when whitespace and a
        capital letter follow.
        """

        # Rubular: http://rubular.com/r/Vnx3m4Spc8
        # NOTE(review): unlike the other three rules, this lookbehind starts
        # with a space -- confirm that is intentional.
        UpperCasePmRule = Rule(r'(?<= P∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/AJMCotJVbW
        UpperCaseAmRule = Rule(r'(?<=A∯M)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/13q7SnOhgA
        LowerCasePmRule = Rule(r'(?<=p∯m)∯(?=\s[A-Z])', '.')

        # Rubular: http://rubular.com/r/DgUDq4mLz5
        LowerCaseAmRule = Rule(r'(?<=a∯m)∯(?=\s[A-Z])', '.')

        All = [UpperCasePmRule, UpperCaseAmRule, LowerCasePmRule, LowerCaseAmRule]

    class Numbers(object):
        """Rules that replace periods adjacent to digits with '∯'."""

        # Rubular: http://rubular.com/r/oNyxBOqbyy
        PeriodBeforeNumberRule = Rule(r'\.(?=\d)', '∯')

        # Rubular: http://rubular.com/r/EMk5MpiUzt
        NumberAfterPeriodBeforeLetterRule = Rule(r'(?<=\d)\.(?=\S)', '∯')

        # Rubular: http://rubular.com/r/rf4l1HjtjG
        NewLineNumberPeriodSpaceLetterRule = Rule(r'(?<=\r\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/HPa4sdc6b9
        StartLineNumberPeriodRule = Rule(r'(?<=^\d)\.(?=(\s\S)|\))', '∯')

        # Rubular: http://rubular.com/r/NuvWnKleFl
        StartLineTwoDigitNumberPeriodRule = Rule(r'(?<=^\d\d)\.(?=(\s\S)|\))', '∯')

        All = [
            PeriodBeforeNumberRule,
            NumberAfterPeriodBeforeLetterRule,
            NewLineNumberPeriodSpaceLetterRule,
            StartLineNumberPeriodRule,
            StartLineTwoDigitNumberPeriodRule
        ]
def remove_newline_in_middle_of_word(self):
    """Remove a newline that directly follows 'の' and precedes non-whitespace.

    The result is stored back on `self.text`.
    """
    newline_after_no = Rule(r'(?<=の)\n(?=\S)', '')
    cleaned = Text(self.text).apply(newline_after_no)
    self.text = cleaned
class ListItemReplacer(object):
    """Marks list items in a text so they can be split onto their own lines.

    Alphabetical, roman-numeral and numbered list markers are detected with
    the regexes below. Item boundaries are rewritten to "\r" and the period
    after a list marker is replaced with the '∯' placeholder.
    """

    # Each numeral must appear exactly once: adjacency between two list
    # markers is decided with list.index(), so the former duplicated run
    # "x xi xii xiii" after "xiv" meant "xiv" and "xv" were never neighbours.
    ROMAN_NUMERALS = "i ii iii iv v vi vii viii ix x xi xii xiii xiv xv xvi xvii xviii xix xx".split(' ')
    LATIN_NUMERALS = list(string.ascii_lowercase)

    # Rubular: http://rubular.com/r/XcpaJKH0sz
    ALPHABETICAL_LIST_WITH_PERIODS = r'(?<=^)[a-z](?=\.)|(?<=\A)[a-z](?=\.)|(?<=\s)[a-z](?=\.)'

    # Rubular: http://rubular.com/r/Gu5rQapywf
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_WITH_PARENS = r'(?<=\()[a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # (pattern, replacement)
    SubstituteListPeriodRule = Rule('♨', '∯')
    ListMarkerRule = Rule('☝', '')

    # Rubular: http://rubular.com/r/Wv4qLdoPx7
    # https://regex101.com/r/62YBlv/1
    SpaceBetweenListItemsFirstRule = Rule(r'(?<=\S\S)\s(?=\S\s*\d+♨)', "\r")

    # Rubular: http://rubular.com/r/AizHXC6HxK
    # https://regex101.com/r/62YBlv/2
    SpaceBetweenListItemsSecondRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}♨)', "\r")

    # Rubular: http://rubular.com/r/GE5q6yID2j
    # https://regex101.com/r/62YBlv/3
    SpaceBetweenListItemsThirdRule = Rule(r'(?<=\S\S)\s(?=\d{1,2}☝)', "\r")

    # Finds the numbers of "N." / "N.)" list markers. The ninth alternative
    # reads (?<=\s\-), like its siblings and NUMBERED_LIST_REGEX_2; it was
    # previously (?<=s\-), i.e. a literal 's' before the dash.
    NUMBERED_LIST_REGEX_1 = (
        r'\s\d{1,2}(?=\.\s)'
        r'|^\d{1,2}(?=\.\s)'
        r'|\s\d{1,2}(?=\.\))'
        r'|^\d{1,2}(?=\.\))'
        r'|(?<=\s\-)\d{1,2}(?=\.\s)'
        r'|(?<=^\-)\d{1,2}(?=\.\s)'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\s)'
        r'|(?<=^\⁃)\d{1,2}(?=\.\s)'
        r'|(?<=\s\-)\d{1,2}(?=\.\))'
        r'|(?<=^\-)\d{1,2}(?=\.\))'
        r'|(?<=\s\⁃)\d{1,2}(?=\.\))'
        r'|(?<=^\⁃)\d{1,2}(?=\.\))'
    )

    # 1. abcd
    # 2. xyz
    NUMBERED_LIST_REGEX_2 = (
        r'(?<=\s)\d{1,2}\.(?=\s)'
        r'|^\d{1,2}\.(?=\s)'
        r'|(?<=\s)\d{1,2}\.(?=\))'
        r'|^\d{1,2}\.(?=\))'
        r'|(?<=\s\-)\d{1,2}\.(?=\s)'
        r'|(?<=^\-)\d{1,2}\.(?=\s)'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=^\⁃)\d{1,2}\.(?=\s)'
        r'|(?<=\s\-)\d{1,2}\.(?=\))'
        r'|(?<=^\-)\d{1,2}\.(?=\))'
        r'|(?<=\s\⁃)\d{1,2}\.(?=\))'
        r'|(?<=^\⁃)\d{1,2}\.(?=\))'
    )

    # 1) abcd
    # 2) xyz
    NUMBERED_LIST_PARENS_REGEX = r'\d{1,2}(?=\)\s)'

    # Rubular: http://rubular.com/r/NsNFSqrNvJ
    # TODO: Make sure below regex call is case-insensitive
    EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX = r'\([a-z]+(?=\))|(?<=^)[a-z]+(?=\))|(?<=\A)[a-z]+(?=\))|(?<=\s)[a-z]+(?=\))'

    # Rubular: http://rubular.com/r/wMpnVedEIb
    # TODO: Make sure below regex call is case-insensitive
    ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX = r'(?<=^)[a-z]\.|(?<=\A)[a-z]\.|(?<=\s)[a-z]\.'

    # Rubular: http://rubular.com/r/GcnmQt4a3I
    ROMAN_NUMERALS_IN_PARENTHESES = r'\(((?=[mdclxvi])m*(c[md]|d?c*)(x[cl]|l?x*)(i[xv]|v?i*))\)(?=\s[A-Z])'

    def __init__(self, text):
        self.text = text

    def add_line_break(self):
        """Run every list formatter in order and return the rewritten text."""
        self.format_alphabetical_lists()
        self.format_roman_numeral_lists()
        self.format_numbered_list_with_periods()
        self.format_numbered_list_with_parens()
        return self.text

    def replace_parens(self):
        """Return the text with parenthesised roman numerals masked.

        "(iv)" followed by whitespace and a capital letter becomes
        "&✂&iv&⌬&". `self.text` itself is not modified.
        """
        return re.sub(self.ROMAN_NUMERALS_IN_PARENTHESES,
                      r'&✂&\1&⌬&', self.text)

    def format_numbered_list_with_parens(self):
        """Mark "N)" items, add line breaks, then drop the '☝' markers."""
        self.replace_parens_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_parens()
        self.text = Text(self.text).apply(self.ListMarkerRule)

    def replace_periods_in_numbered_list(self):
        """Replace the period of consecutive "N." markers with '♨'."""
        self.scan_lists(self.NUMBERED_LIST_REGEX_1, self.NUMBERED_LIST_REGEX_2,
                        '♨', strip=True)

    def format_numbered_list_with_periods(self):
        """Mark "N." items, add line breaks, then turn '♨' into '∯'."""
        self.replace_periods_in_numbered_list()
        self.add_line_breaks_for_numbered_list_with_periods()
        self.text = Text(self.text).apply(self.SubstituteListPeriodRule)

    def format_alphabetical_lists(self):
        """Format "a." and "a)" style lists; return the updated text."""
        # iterate_alphabet_array() already stores its result on self.text;
        # self.txt only mirrors it and is kept for backward compatibility.
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=False)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=False)
        return self.txt

    def format_roman_numeral_lists(self):
        """Format "i." and "i)" style lists; return the updated text."""
        self.txt = self.add_line_breaks_for_alphabetical_list_with_periods(
            roman_numeral=True)
        self.txt = self.add_line_breaks_for_alphabetical_list_with_parens(
            roman_numeral=True)
        return self.txt

    def add_line_breaks_for_alphabetical_list_with_periods(
            self, roman_numeral=False):
        """Process letter/numeral markers that are followed by a period."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PERIODS, roman_numeral=roman_numeral)

    def add_line_breaks_for_alphabetical_list_with_parens(self, roman_numeral=False):
        """Process letter/numeral markers that are followed by ')'."""
        return self.iterate_alphabet_array(
            self.ALPHABETICAL_LIST_WITH_PARENS,
            parens=True,
            roman_numeral=roman_numeral)

    def scan_lists(self, regex1, regex2, replacement, strip=False):
        """Mark numbers that form a run of consecutive list items.

        Args:
            regex1: pattern whose matches are the candidate item numbers.
            regex2: pattern used to rewrite the markers in `self.text`.
            replacement: symbol appended to a confirmed item number.
            strip: whether matches of `regex2` are stripped of surrounding
                whitespace before being compared.
        """
        numbers = [int(found) for found in re.findall(regex1, self.text)]
        last_index = len(numbers) - 1
        for ind, item in enumerate(numbers):
            # Bounds are checked explicitly to avoid IndexError (ruby
            # returns nil if an index is out of range).
            if ind < last_index and item + 1 == numbers[ind + 1]:
                self.substitute_found_list_items(regex2, item, strip, replacement)
            elif ind > 0:
                previous = numbers[ind - 1]
                if (item - 1 == previous
                        or (item == 0 and previous == 9)
                        or (item == 9 and previous == 0)):
                    self.substitute_found_list_items(regex2, item, strip, replacement)

    def substitute_found_list_items(self, regex, each, strip, replacement):
        """Append `replacement` to every match of `regex` whose number is `each`.

        Trailing '.', ']' and ')' characters of a match are ignored for the
        comparison and are not re-emitted for a confirmed item.
        """
        def replace_item(match):
            found = match.group()
            if strip:
                found = found.strip()
            # A single character is compared as-is; longer matches lose any
            # leading/trailing '.', ']' or ')'.
            chomped = found if len(found) == 1 else found.strip('.])')
            if str(each) == chomped:
                return "{}{}".format(each, replacement)
            return found

        self.text = re.sub(regex, replace_item, self.text)

    def add_line_breaks_for_numbered_list_with_periods(self):
        """Break the line before '♨' items unless a guard condition holds.

        Nothing happens when two markers are already separated by a line
        break or when the text contains "for <N>♨ <lowercase letter>".
        """
        if ('♨' in self.text) and (not re.search(
                '♨.+(\n|\r).+♨', self.text)) and (not re.search(
                    r'for\s\d{1,2}♨\s[a-z]', self.text)):
            self.text = Text(self.text).apply(self.SpaceBetweenListItemsFirstRule,
                                              self.SpaceBetweenListItemsSecondRule)

    def replace_parens_in_numbered_list(self):
        """Append '☝' to the numbers of consecutive "N)" items."""
        # The scan runs twice on purpose: numbers marked by the first pass no
        # longer match, so the second pass can pair up the remaining ones.
        self.scan_lists(
            self.NUMBERED_LIST_PARENS_REGEX, self.NUMBERED_LIST_PARENS_REGEX, '☝')
        self.scan_lists(self.NUMBERED_LIST_PARENS_REGEX,
                        self.NUMBERED_LIST_PARENS_REGEX, '☝')

    def add_line_breaks_for_numbered_list_with_parens(self):
        """Break the line before '☝' items unless breaks already separate them."""
        if '☝' in self.text and not re.search("☝.+\n.+☝|☝.+\r.+☝", self.text):
            self.text = Text(self.text).apply(
                self.SpaceBetweenListItemsThirdRule)

    def replace_alphabet_list(self, a):
        r"""Put the list marker `a` on its own line and mask its period.

        Input: 'a. ffegnog b. fgegkl c.'
        Output: \ra∯ ffegnog \rb∯ fgegkl \rc∯
        """
        def replace_letter_period(match):
            found = match.group()
            letter = found.strip('.')
            if letter == a:
                return '\r{}∯'.format(letter)
            return found

        return re.sub(self.ALPHABETICAL_LIST_LETTERS_AND_PERIODS_REGEX,
                      replace_letter_period, self.text, flags=re.IGNORECASE)

    def replace_alphabet_list_parens(self, a):
        r"""Put the list marker `a` on its own line; mask a leading '('.

        Input: "a) ffegnog (b) fgegkl c)"
        Output: "\ra) ffegnog \r&✂&b) fgegkl \rc)"
        """
        def replace_alphabet_paren(match):
            found = match.group()
            if '(' in found:
                letter = found.strip('(')
                if letter == a:
                    return '\r&✂&{}'.format(letter)
                return found
            if found == a:
                return '\r{}'.format(found)
            return found

        # Make it case-insensitive
        return re.sub(self.EXTRACT_ALPHABETICAL_LIST_LETTERS_REGEX,
                      replace_alphabet_paren, self.text, flags=re.IGNORECASE)

    def replace_correct_alphabet_list(self, a, parens):
        """Dispatch to the parens or period variant for marker `a`."""
        if parens:
            return self.replace_alphabet_list_parens(a)
        return self.replace_alphabet_list(a)

    def last_array_item_replacement(self, a, i, alphabet, list_array, parens):
        """Rewrite the last marker if it is adjacent to the previous one.

        Returns the unchanged text when either marker is not in `alphabet`
        or their positions in `alphabet` are not exactly one apart.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet):
            return self.text
        if abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def other_items_replacement(self, a, i, alphabet, list_array, parens):
        """Rewrite a non-last marker if it is adjacent to a neighbour.

        The marker qualifies when the next marker directly follows it in
        `alphabet`, or the previous one is exactly one position away. For
        i == 0 the "previous" marker is the last element of `list_array`.
        """
        if (not alphabet and not list_array) or (
                list_array[i - 1] not in alphabet) or (a not in alphabet) or (
                    list_array[i + 1] not in alphabet):
            return self.text
        if alphabet.index(list_array[i + 1]) - alphabet.index(a) != 1 and \
                abs(alphabet.index(list_array[i - 1]) - alphabet.index(a)) != 1:
            return self.text
        return self.replace_correct_alphabet_list(a, parens)

    def iterate_alphabet_array(self, regex, parens=False, roman_numeral=False):
        """Find markers with `regex` and rewrite those that form a sequence.

        Args:
            regex: pattern that extracts candidate markers from `self.text`.
            parens: use the "a)" replacement instead of the "a." one.
            roman_numeral: compare against ROMAN_NUMERALS instead of
                LATIN_NUMERALS.

        Returns:
            The updated `self.text`.
        """
        alphabet = self.ROMAN_NUMERALS if roman_numeral else self.LATIN_NUMERALS
        list_array = [found for found in re.findall(regex, self.text)
                      if found in alphabet]
        last_index = len(list_array) - 1
        for ind, each in enumerate(list_array):
            if ind == last_index:
                self.text = self.last_array_item_replacement(
                    each, ind, alphabet, list_array, parens)
            else:
                self.text = self.other_items_replacement(
                    each, ind, alphabet, list_array, parens)
        return self.text