def sentenize(s):
    """Sentenizes a string.

    Args:
        s (string): string to sentenize

    Returns:
        string: sentenized string
    """
    # Normalize whitespace, then split at the coarse sentence boundaries.
    s = apply_regex_list(s, NORM_REGEX)
    offsets = list(_boundary_gen(s, SPLIT_REGEX))
    s = '\n'.join(s[o[0]:o[1]] for o in offsets)
    # Refine the split, undo splits inside parentheses and around
    # abbreviations, and put enumeration items on their own lines.
    s = apply_regex_list(s, REFINED_SPLIT_REGEX)
    s = apply_regex_list(s, SUBSENTENCE_REGEX)
    s = apply_regex_list(s, RECOMBINE_REGEX)
    s = apply_regex_list(s, SPLIT_ENUM_REGEX)
    return s
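# Usage sketch (hypothetical input text; the exact output depends on the
# regex lists defined above):
#
#     text = 'Two sentences.Glued together. (1) Plus an enumeration item.'
#     print(sentenize(text))
#
# Each detected sentence ends up on its own line; a missing space after a
# period is handled by REFINED_SPLIT_REGEX (cf. test_refined_split below).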
def correct(s):
    """Corrects a string.

    Args:
        s (string): string to correct

    Returns:
        string: corrected string
    """
    return apply_regex_list(s, CORRECTION_REGEX)
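# Usage sketch, mirroring test_semi_colon_correction below:
#
#     correct('Errors with semi colons;they can happen;but are easy to correct.')
#     # -> 'Errors with semi colons; they can happen; but are easy to correct.'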
def sentenize_with_index(s):
    """Sentenizes a string but remembers at which positions the
    length-changing splits are made.

    Args:
        s (string): string to sentenize

    Returns:
        (string, list): sentenized string and the positions of the
            length-changing splits
    """
    indices = []
    offsets = list(_boundary_gen(s, SPLIT_REGEX))
    s = '\n'.join(s[o[0]:o[1]] for o in offsets)
    # Record the end of the first capture group for every match of the
    # length-changing refined-split regexes before they are applied.
    for r, _ in REFINED_SPLIT_REGEX_CHANGE_LENGTH:
        for match in reversed(list(r.finditer(s))):
            indices.append(match.span(1)[1])
    s = apply_regex_list(s, REFINED_SPLIT_REGEX_CHANGE_LENGTH)
    s = apply_regex_list(s, REFINED_SPLIT_REGEX_KEEP_LENGTH)
    s = apply_regex_list(s, SUBSENTENCE_REGEX)
    s = apply_regex_list(s, RECOMBINE_REGEX)
    s = apply_regex_list(s, SPLIT_ENUM_REGEX_KEEP_LENGTH)
    # Same bookkeeping for the length-changing enumeration splits.
    for r, _ in SPLIT_ENUM_REGEX_CHANGE_LENGTH:
        for match in reversed(list(r.finditer(s))):
            indices.append(match.span(1)[1])
    s = apply_regex_list(s, SPLIT_ENUM_REGEX_CHANGE_LENGTH)
    return s, indices
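# A minimal sketch of the apply_regex_list helper used throughout, assuming
# each *_REGEX list holds (compiled pattern, replacement) pairs; the
# `for r, _ in ...` unpacking above suggests that shape. The actual
# implementation lives elsewhere in this module:
#
#     def apply_regex_list(s, regex_list):
#         for pattern, replacement in regex_list:
#             s = pattern.sub(replacement, s)
#         return s
#
# Likewise, _boundary_gen(s, SPLIT_REGEX) is assumed to yield (start, end)
# offsets of the sentence spans detected in s.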
# Test imports (module paths assumed from the attribute access below;
# adjust to the actual package layout):
import sentenize
import corrections
from sentenize import apply_regex_list


def test_string_normalization():
    s = ' This is a test for string normalization in \n all cases. '
    s = apply_regex_list(s, sentenize.NORM_REGEX)
    assert s == 'This is a test for string normalization in\nall cases.'
def test_split_enumerations():
    s = 'Something quite annoying: (1) Enumerations are sometimes used as standalone sentences; (2) This is a case in which we want to split them off.'
    s = apply_regex_list(s, sentenize.SPLIT_ENUM_REGEX)
    assert s == 'Something quite annoying:\n(1) Enumerations are sometimes used as standalone sentences;\n(2) This is a case in which we want to split them off.'
def test_formtok_split():
    s = 'Strings should be split after a formtok When the next sentence starts with an upper case letter.'
    s = apply_regex_list(s, sentenize.REFINED_SPLIT_REGEX)
    assert s == 'Strings should be split after a formtok\nWhen the next sentence starts with an upper case letter.'
def test_recombination():
    s = 'Lets assume\nthere are splits.\nwhile there approx.\nshould be zero, e.\ng.\nBecause of abbreviations or Fig.\n5. As said by Test et al.\n[56].'
    s = apply_regex_list(s, sentenize.RECOMBINE_REGEX)
    assert s == 'Lets assume there are splits. while there approx. should be zero, e. g. Because of abbreviations or Fig. 5. As said by Test et al. [56].'
def test_subsentence_recognition():
    s = 'There should be no splits (even with stuff like this.\nBut well..).'
    s = apply_regex_list(s, sentenize.SUBSENTENCE_REGEX)
    assert s == 'There should be no splits (even with stuff like this. But well..).'
def test_refined_split():
    s = 'The refined split should do even more.For example find "errors".'
    s = apply_regex_list(s, sentenize.REFINED_SPLIT_REGEX)
    assert s == 'The refined split should do even more.\nFor example find "errors".'
def test_bracket_correction():
    s = 'Testing errors(performed with brackets)in order to make sure they do not happen'
    s = apply_regex_list(s, corrections.CORRECTION_REGEX)
    assert s == 'Testing errors (performed with brackets) in order to make sure they do not happen'
def test_semi_colon_correction():
    s = 'Errors with semi colons;they can happen;but are easy to correct.'
    s = apply_regex_list(s, corrections.CORRECTION_REGEX)
    assert s == 'Errors with semi colons; they can happen; but are easy to correct.'
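# End-to-end sketch (hypothetical raw_text; one plausible ordering, since
# the module itself does not prescribe whether correct() runs before
# sentenize()):
#
#     cleaned = correct(raw_text)
#     sentences = sentenize(cleaned).split('\n')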