예제 #1
0
파일: processor.py 프로젝트: shradhit/pySBD
 def split_into_segments(self):
     """Split ``self.text`` into a flat list of post-processed segments.

     The text is cut on carriage returns, each piece is run through the
     language's single-newline and ellipsis rules and then through
     ``check_for_punctuation``; the resulting pieces are post-processed
     one by one and finally passed through ``SubSingleQuoteRule``.

     Returns:
         list: one entry per segment produced by the pipeline.
     """
     self.check_for_parens_between_quotes()

     # '\r' is the delimiter used to cut the text into raw pieces;
     # empty / None entries are dropped by rm_none_flatten.
     pieces = self.rm_none_flatten(self.text.split('\r'))

     first_pass_rules = (self.lang.SingleNewLineRule,
                         *self.lang.EllipsisRules.All)
     pieces = [Text(piece).apply(*first_pass_rules) for piece in pieces]
     pieces = [self.check_for_punctuation(piece) for piece in pieces]
     # check_for_punctuation may hand back nested lists; flatten again.
     pieces = self.rm_none_flatten(pieces)

     segments = []
     for piece in pieces:
         restored = Text(piece).apply(*self.lang.SubSymbolsRules.All)
         processed = self.post_process_segments(restored)
         if isinstance(processed, list):
             segments.extend(processed)
         elif processed and isinstance(processed, str):
             # skip empty strings and anything that is not a string
             segments.append(processed)

     return [
         Text(segment).apply(self.lang.SubSingleQuoteRule)
         for segment in segments
     ]
예제 #2
0
 def replace(self):
     """Run the abbreviation replacement pipeline over ``self.text``.

     Steps, in order: possessive / Kommanditgesellschaft / single-letter
     abbreviation rules, the abbreviation search, multi-period
     abbreviations, the AM/PM rules, and finally
     ``replace_abbreviation_as_sentence_boundary``.

     Returns:
         The updated ``self.text``.
     """
     leading_rules = [Common.PossessiveAbbreviationRule,
                      Common.KommanditgesellschaftRule]
     leading_rules.extend(SingleLetterAbbreviationRules.All)
     self.text = Text(self.text).apply(*leading_rules)

     self.text = self.search_for_abbreviations_in_string()
     self.replace_multi_period_abbreviations()

     am_pm_applied = Text(self.text).apply(*AmPmRules.All)
     self.text = am_pm_applied
     self.text = replace_abbreviation_as_sentence_boundary(self.text)
     return self.text
예제 #3
0
파일: processor.py 프로젝트: shradhit/pySBD
 def sentence_boundary_punctuation(self, txt):
     """Return every match of the language's sentence-boundary regex.

     Two language rules are applied first, but only when the language
     object defines them. A trailing ``&ᓴ&`` placeholder is turned back
     into ``!`` before matching.

     Args:
         txt: text to scan.

     Returns:
         list[str]: the matched substrings, in order of appearance.
     """
     optional_rule_names = ('ReplaceColonBetweenNumbersRule',
                            'ReplaceNonSentenceBoundaryCommaRule')
     for rule_name in optional_rule_names:
         # not every language defines these rules
         if hasattr(self.lang, rule_name):
             txt = Text(txt).apply(getattr(self.lang, rule_name))

     # retain exclamation mark if it is the ending character of the text
     txt = re.sub(r'&ᓴ&$', '!', txt)

     boundary_matches = re.finditer(self.lang.SENTENCE_BOUNDARY_REGEX, txt)
     return [found.group() for found in boundary_matches]
예제 #4
0
 def replace(self):
     """Run the abbreviation replacement pipeline over ``self.text``.

     The abbreviation search is performed line by line (line endings are
     kept) and the results are concatenated back together.

     Returns:
         The updated ``self.text``.
     """
     lang = self.lang
     self.text = Text(self.text).apply(
         lang.PossessiveAbbreviationRule,
         lang.KommanditgesellschaftRule,
         *lang.SingleLetterAbbreviationRules.All)

     # keepends=True so that re-joining reproduces the line breaks
     self.text = "".join(
         self.search_for_abbreviations_in_string(line)
         for line in self.text.splitlines(True))

     self.replace_multi_period_abbreviations()
     self.text = Text(self.text).apply(*self.lang.AmPmRules.All)
     self.text = self.replace_abbreviation_as_sentence_boundary()
     return self.text
예제 #5
0
def replace_punctuation(match, match_type=None):
    """Replace punctuation inside a regex match with placeholder tokens.

    The matched text is first passed through the
    ``EscapeRegexReservedCharacters`` rules, each punctuation character is
    swapped for its placeholder, and the result is passed through the
    ``SubEscapedRegexReservedCharacters`` rules.

    ASCII and full-width punctuation get distinct placeholders. The
    previous version listed the ASCII ``.``, ``!`` and ``?`` patterns
    twice, which made the second substitution of each pair dead code and
    mapped ASCII ``!`` to ``&ᓳ&`` instead of ``&ᓴ&`` (the placeholder the
    rest of the pipeline uses for ASCII ``!``).

    Args:
        match: a regex match object; ``match.group()`` is the text used.
        match_type: when anything other than ``'single'``, single quotes
            are also replaced (with ``&⎋&``).

    Returns:
        The text with punctuation replaced by placeholders.
    """
    text = Text(match.group()).apply(*EscapeRegexReservedCharacters.All)

    # (punctuation character, placeholder), applied in this order
    substitutions = (
        ('.', '∯'),
        ('。', '&ᓰ&'),
        ('．', '&ᓱ&'),   # full-width full stop
        ('！', '&ᓳ&'),   # full-width exclamation mark
        ('!', '&ᓴ&'),
        ('?', '&ᓷ&'),
        ('？', '&ᓸ&'),   # full-width question mark
    )
    for char, placeholder in substitutions:
        text = re.sub(re.escape(char), placeholder, text)

    if match_type != 'single':
        text = re.sub(r"'", '&⎋&', text)

    return Text(text).apply(*SubEscapedRegexReservedCharacters.All)
예제 #6
0
파일: processor.py 프로젝트: shradhit/pySBD
 def process_text(self, txt):
     """Run the punctuation-handling pipeline and split ``txt``.

     Args:
         txt: non-empty text; the last character is inspected, so an
             empty string raises ``IndexError``.

     Returns:
         Whatever ``sentence_boundary_punctuation`` returns for the
         processed text.
     """
     lang = self.lang

     # mark text that does not end in a known punctuation character
     if txt[-1] not in lang.Punctuations:
         txt = txt + 'ȸ'

     txt = self.between_punctuation(ExclamationWords.apply_rules(txt))

     # text consisting only of double punctuation is left alone here
     starts_with_double_punct = re.match(
         lang.DoublePunctuationRules.DoublePunctuation, txt)
     if not starts_with_double_punct:
         txt = Text(txt).apply(*lang.DoublePunctuationRules.All)

     txt = Text(txt).apply(lang.QuestionMarkInQuotationRule,
                           *lang.ExclamationPointRules.All)
     txt = ListItemReplacer(txt).replace_parens()
     return self.sentence_boundary_punctuation(txt)
예제 #7
0
 def clean(self):
     """Run every cleaning step over ``self.text`` and return the result.

     Falsy text (``None`` or an empty string) is returned unchanged
     without running any step.
     """
     if self.text:
         # newline normalisation
         self.remove_all_newlines()
         self.replace_double_newlines()
         self.replace_newlines()
         self.replace_escaped_newlines()

         # markup and bracket handling
         self.text = Text(self.text).apply(*HTML.All)
         self.replace_punctuation_in_brackets()
         self.text = Text(self.text).apply(cr.InlineFormattingRule)

         # remaining clean-up passes
         self.clean_quotations()
         self.clean_table_of_contents()
         self.check_for_no_space_in_between_sentences()
         self.clean_consecutive_characters()
     return self.text
예제 #8
0
파일: processor.py 프로젝트: shradhit/pySBD
    def post_process_segments(self, txt):
        """Post-process a single segment.

        Args:
            txt: the segment text.

        Returns:
            * ``txt`` unchanged when it is longer than two characters and
              consists only of ASCII letters;
            * a list (the result of ``re.split``) when the language's
              quotation-at-end-of-sentence regex matches;
            * otherwise the text with newlines removed and surrounding
              whitespace stripped.
        """
        if len(txt) > 2 and re.search(r'\A[a-zA-Z]*\Z', txt):
            return txt

        # NOTE: pragmatic segmenter additionally returns early for
        # consecutive underscores or very short text; that condition is
        # intentionally not applied here.

        # TODO:
        # Decide on keeping or removing Standard.ExtraWhiteSpaceRule.
        # It is currently not applied so original text spans are retained.
        txt = Text(txt).apply(*self.lang.ReinsertEllipsisRules.All)

        if re.search(self.lang.QUOTATION_AT_END_OF_SENTENCE_REGEX, txt):
            return re.split(
                self.lang.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEX, txt)

        return txt.replace('\n', '').strip()
예제 #9
0
 def replace_newlines(self):
     """Handle newlines according to ``self.doc_type``.

     PDF documents are delegated to ``remove_pdf_line_breaks``; every
     other document type gets the two generic newline rules applied.
     """
     if self.doc_type == 'pdf':
         self.remove_pdf_line_breaks()
         return

     newline_rules = (cr.NewLineFollowedByPeriodRule,
                      cr.ReplaceNewlineWithCarriageReturnRule)
     self.text = Text(self.text).apply(*newline_rules)
예제 #10
0
 def clean_quotations(self):
     """Normalise quotation characters in ``self.text``.

     This method was added explicitly; pragmatic-segmenter applies the
     same step at a different location.
     """
     # backticks become plain single quotes
     without_backticks = re.sub('`', "'", self.text)
     self.text = without_backticks

     quotation_rules = (cr.QuotationsFirstRule, cr.QuotationsSecondRule)
     self.text = Text(self.text).apply(*quotation_rules)
예제 #11
0
 def add_line_breaks_for_numbered_list_with_periods(self):
     """Apply the list-item spacing rules when the text needs them.

     Nothing happens unless the text contains the ``♨`` marker, and the
     rules are skipped when two markers are already separated by a line
     break or when the marker follows ``for <1-2 digits>``.
     """
     text = self.text
     if '♨' not in text:
         return
     # markers already separated by a newline / carriage return
     if re.search('♨.+(\n|\r).+♨', text):
         return
     # e.g. "for 10♨ a ..." is not treated as a list item
     if re.search(r'for\s\d{1,2}♨\s[a-z]', text):
         return

     self.text = Text(text).apply(
         self.SpaceBetweenListItemsFirstRule,
         self.SpaceBetweenListItemsSecondRule)
예제 #12
0
 def search_for_connected_sentences(self, word, txt, regex, rule):
     """Apply ``rule`` to ``word`` and substitute the result into ``txt``.

     Args:
         word: candidate token.
         txt: the full text the word came from.
         regex: pattern the word must match to be considered.
         rule: rule applied to the word when it qualifies.

     Returns:
         ``txt`` unchanged when the word does not match ``regex`` or
         contains a URL/e-mail keyword; otherwise ``txt`` with every
         literal occurrence of the word replaced by the rewritten word.
     """
     matches_pattern = re.search(regex, word)
     if matches_pattern and not any(
             keyword in word for keyword in cr.URL_EMAIL_KEYWORDS):
         rewritten = Text(word).apply(rule)
         # the word is escaped so it is matched literally
         txt = re.sub(re.escape(word), rewritten, txt)
     return txt
예제 #13
0
        def replace(self):
            """Run the abbreviation replacement pipeline over ``self.text``.

            In addition to the language's own rules, two local rules
            replace the period after a single lower-case ASCII letter
            (mid-line and at the start of a line) with ``∯``.

            Returns:
                The updated ``self.text``.
            """
            # Rubular: http://rubular.com/r/B4X33QKIL8
            lower_letter_rule = Rule(r'(?<=\s[a-z])\.(?=\s)', '∯')
            # Rubular: http://rubular.com/r/iUNSkCuso0
            lower_letter_at_start_rule = Rule(r'(?<=^[a-z])\.(?=\s)', '∯')

            lang = self.lang
            self.text = Text(self.text).apply(
                lang.PossessiveAbbreviationRule,
                *lang.SingleLetterAbbreviationRules.All,
                lower_letter_rule,
                lower_letter_at_start_rule)

            self.text = self.search_for_abbreviations_in_string()
            self.replace_multi_period_abbreviations()

            am_pm_applied = Text(self.text).apply(*self.lang.AmPmRules.All)
            self.text = am_pm_applied
            self.text = self.replace_abbreviation_as_sentence_boundary()
            return self.text
예제 #14
0
파일: cleaner.py 프로젝트: yushu-liu/pySBD
 def search_for_connected_sentences(self, word, txt, regex, rule):
     """Apply ``rule`` to ``word`` and substitute the result into ``txt``.

     Args:
         word: candidate token.
         txt: the full text the word came from.
         regex: pattern the word must match to be considered.
         rule: rule applied to the word when it qualifies.

     Returns:
         ``txt`` unchanged when the word does not match ``regex``, contains
         a URL/e-mail keyword, or contains a known abbreviation; otherwise
         ``txt`` with every literal occurrence of the word replaced by the
         rewritten word.
     """
     if not re.search(regex, word):
         return txt
     if any(k in word for k in cr.URL_EMAIL_KEYWORDS):
         return txt
     if any(a in word for a in Abbreviation.ABBREVIATIONS):
         return txt
     new_word = Text(word).apply(rule)
     # Escape the word: it is raw input text, so characters such as
     # '.', '(' or '?' must match literally rather than act as regex syntax.
     txt = re.sub(re.escape(word), new_word, txt)
     return txt
예제 #15
0
 def replace(self):
     """Replace periods after single upper-case Cyrillic letters.

     The period following a lone upper-case Cyrillic letter (at the start
     of the text or after whitespace) is replaced with ``∯`` when it is
     followed by whitespace; multi-period abbreviations are handled
     afterwards.

     Returns:
         The updated ``self.text``.
     """
     at_start_rule = Rule(r'(?<=^[А-ЯЁ])\.(?=\s)', '∯')
     after_space_rule = Rule(r'(?<=\s[А-ЯЁ])\.(?=\s)', '∯')

     cyrillic_rules = (at_start_rule, after_space_rule)
     self.text = Text(self.text).apply(*cyrillic_rules)

     self.replace_multi_period_abbreviations()
     return self.text
예제 #16
0
        def between_punctuation(self, txt):
            """Run the between-punctuation processor plus two dash rules.

            After the processor has run, a ``?`` or ``!`` that is followed
            (optionally through whitespace) by a hyphen or em dash is
            replaced with its placeholder token.

            Args:
                txt: text to process.

            Returns:
                The processed text.
            """
            processed = self.between_punctuation_processor(txt).replace()

            dash_rules = (
                # Rubular: http://rubular.com/r/WRWy56Z5zp
                Rule(r'(?<=)\?(?=\s*[-—]\s*)', '&ᓷ&'),
                # Rubular: http://rubular.com/r/lixxP7puSa
                Rule(r'(?<=)!(?=\s*[-—]\s*)', '&ᓴ&'),
            )
            return Text(processed).apply(*dash_rules)
예제 #17
0
파일: processor.py 프로젝트: shradhit/pySBD
 def process(self):
     """Run the full segmentation pipeline over ``self.text``.

     Returns:
         ``self.text`` itself when it is falsy (``None`` or empty);
         otherwise the result of ``split_into_segments``.
     """
     if not self.text:
         return self.text

     self.text = ListItemReplacer(self.text).add_line_break()

     # in-place replacement passes; each one updates self.text
     self.replace_abbreviations()
     self.replace_numbers()
     self.replace_continuous_punctuation()
     self.replace_periods_before_numeric_references()

     lang = self.lang
     self.text = Text(self.text).apply(
         lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
         lang.GeoLocationRule,
         lang.FileFormatRule)

     return self.split_into_segments()
예제 #18
0
 def process(self):
     """Run the full segmentation pipeline over ``self.text``.

     The original text is also passed to ``nlp`` and the result is stored
     on ``self.doc`` before any replacement is made.

     Returns:
         ``self.text`` itself when it is falsy; otherwise the segments,
         passed through ``sentences_with_char_spans`` when
         ``self.char_span`` is truthy.
     """
     if not self.text:
         return self.text

     self.doc = nlp(self.text)
     self.text = ListItemReplacer(self.text).add_line_break()

     # in-place replacement passes; each one updates self.text
     self.replace_abbreviations()
     self.replace_numbers()
     self.replace_continuous_punctuation()
     self.replace_periods_before_numeric_references()

     lang = self.lang
     self.text = Text(self.text).apply(
         lang.Abbreviation.WithMultiplePeriodsAndEmailRule,
         lang.GeoLocationRule,
         lang.FileFormatRule)

     segments = self.split_into_segments()
     return (self.sentences_with_char_spans(segments)
             if self.char_span else segments)
예제 #19
0
 def format_numbered_list_with_parens(self):
     """Format numbered lists whose markers use parentheses.

     Runs the parenthesis replacement and line-break helpers, then
     applies ``self.ListMarkerRule`` to ``self.text``.
     """
     self.replace_parens_in_numbered_list()
     self.add_line_breaks_for_numbered_list_with_parens()
     current = Text(self.text)
     self.text = current.apply(self.ListMarkerRule)
예제 #20
0
 def replace_escaped_newlines(self):
     """Apply the four escaped newline / carriage-return rules in order."""
     escaped_rules = (
         cr.EscapedNewLineRule,
         cr.EscapedCarriageReturnRule,
         cr.TypoEscapedNewLineRule,
         cr.TypoEscapedCarriageReturnRule,
     )
     self.text = Text(self.text).apply(*escaped_rules)
예제 #21
0
 def format_numbered_list_with_periods(self):
     """Format numbered lists whose markers use periods.

     Runs the period replacement and line-break helpers, then applies
     ``self.SubstituteListPeriodRule`` to ``self.text``.
     """
     self.replace_periods_in_numbered_list()
     self.add_line_breaks_for_numbered_list_with_periods()
     current = Text(self.text)
     self.text = current.apply(self.SubstituteListPeriodRule)
예제 #22
0
 def remove_pdf_line_breaks(self):
     """Apply the bullet and PDF mid-sentence newline rules to the text."""
     pdf_rules = (
         cr.NewLineFollowedByBulletRule,
         PDF.NewLineInMiddleOfSentenceRule,
         PDF.NewLineInMiddleOfSentenceNoSpacesRule,
     )
     self.text = Text(self.text).apply(*pdf_rules)
예제 #23
0
 def replace_double_newlines(self):
     """Apply the two double-newline rules (with-space variant first)."""
     double_newline_rules = (cr.DoubleNewLineWithSpaceRule,
                             cr.DoubleNewLineRule)
     self.text = Text(self.text).apply(*double_newline_rules)
예제 #24
0
 def remove_newline_in_middle_of_word(self):
     """Apply ``cr.NewLineInMiddleOfWordRule`` to ``self.text``."""
     current = Text(self.text)
     self.text = current.apply(cr.NewLineInMiddleOfWordRule)
예제 #25
0
 def replace_numbers(self):
     """Apply the number rules, then the date/ordinal/roman helpers.

     Returns:
         The updated ``self.text``.
     """
     number_rules = self.lang.Numbers.All
     self.text = Text(self.text).apply(*number_rules)

     # language-specific period handling; each helper works on self.text
     self.replace_period_in_slovak_dates()
     self.replace_period_in_ordinal_numerals()
     self.replace_period_in_roman_numerals()
     return self.text
예제 #26
0
 def clean_consecutive_characters(self):
     """Apply the consecutive-period and consecutive-slash rules."""
     consecutive_rules = (cr.ConsecutivePeriodsRule,
                          cr.ConsecutiveForwardSlashRule)
     self.text = Text(self.text).apply(*consecutive_rules)
예제 #27
0
 def replace_numbers(self):
     """Apply the number rules, then handle periods in Deutsch dates.

     Returns:
         The updated ``self.text``.
     """
     number_rules = self.lang.Numbers.All
     self.text = Text(self.text).apply(*number_rules)
     self.replace_period_in_deutsch_dates()
     return self.text
예제 #28
0
 def clean_table_of_contents(self):
     """Apply the table-of-contents rule and the consecutive-char rules."""
     toc_rules = (
         cr.TableOfContentsRule,
         cr.ConsecutivePeriodsRule,
         cr.ConsecutiveForwardSlashRule,
     )
     self.text = Text(self.text).apply(*toc_rules)
예제 #29
0
파일: processor.py 프로젝트: shradhit/pySBD
 def replace_numbers(self):
     """Apply every rule in ``self.lang.Numbers.All`` to ``self.text``."""
     number_rules = self.lang.Numbers.All
     self.text = Text(self.text).apply(*number_rules)
예제 #30
0
 def remove_newline_in_middle_of_word(self):
     """Remove a newline that directly follows ``の`` and precedes
     a non-whitespace character."""
     newline_after_no_rule = Rule(r'(?<=の)\n(?=\S)', '')
     current = Text(self.text)
     self.text = current.apply(newline_after_no_rule)