Example #1
def sentenize(s):
    """Sentenizes a string

    Args:
        s (string): string to sentenize

    Returns:
        string: sentenized string
    """
    s = apply_regex_list(s, NORM_REGEX)
    offsets = list(_boundary_gen(s, SPLIT_REGEX))
    s = '\n'.join(s[start:end] for start, end in offsets)
    s = apply_regex_list(s, REFINED_SPLIT_REGEX)
    s = apply_regex_list(s, SUBSENTENCE_REGEX)
    s = apply_regex_list(s, RECOMBINE_REGEX)
    s = apply_regex_list(s, SPLIT_ENUM_REGEX)

    return s
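
# Note: apply_regex_list is used by every snippet here but is not shown.
# A minimal sketch of what such a helper might look like, assuming each
# regex list holds (compiled_pattern, replacement) pairs; the actual
# implementation may differ.
def apply_regex_list(s, regex_list):
    """Apply a list of (compiled pattern, replacement) pairs to s in order."""
    for pattern, replacement in regex_list:
        s = pattern.sub(replacement, s)
    return s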
Example #2
def correct(s):
    """Correct a string

    Args:
        s (string): string to correct

    Returns:
        string: corrected string
    """
    s = apply_regex_list(s, CORRECTION_REGEX)

    return s
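
# Note: CORRECTION_REGEX itself is not shown. Judging from
# test_bracket_correction and test_semi_colon_correction further below, its
# entries might look roughly like the following; these patterns are only an
# illustration, not the actual ones.
import re

EXAMPLE_CORRECTION_REGEX = [
    (re.compile(r'(\w)\('), r'\1 ('),  # missing space before an opening bracket
    (re.compile(r'\)(\w)'), r') \1'),  # missing space after a closing bracket
    (re.compile(r';(\S)'), r'; \1'),   # missing space after a semicolon
]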
Example #3
def sentenize_with_index(s):
    """Sentenizes a string but remember at what position a change is made

    Args:
        s (string): string to sentenize

    Returns:
        string, positions: corrected string
    """
    indices = []
    offsets = list(_boundary_gen(s, SPLIT_REGEX))
    s = '\n'.join(s[start:end] for start, end in offsets)
    # Record the end of capture group 1 for every match of the
    # length-changing refinement regexes (iterated in reverse) before
    # applying them.
    for r, _ in REFINED_SPLIT_REGEX_CHANGE_LENGTH:
        for match in reversed(list(r.finditer(s))):
            indices.append(match.span(1)[1])
    s = apply_regex_list(s, REFINED_SPLIT_REGEX_CHANGE_LENGTH)
    s = apply_regex_list(s, REFINED_SPLIT_REGEX_KEEP_LENGTH)
    s = apply_regex_list(s, SUBSENTENCE_REGEX)
    s = apply_regex_list(s, RECOMBINE_REGEX)
    s = apply_regex_list(s, SPLIT_ENUM_REGEX_KEEP_LENGTH)
    # Likewise record match positions for the length-changing enumeration
    # regexes before applying them.
    for r, _ in SPLIT_ENUM_REGEX_CHANGE_LENGTH:
        for match in reversed(list(r.finditer(s))):
            indices.append(match.span(1)[1])
    s = apply_regex_list(s, SPLIT_ENUM_REGEX_CHANGE_LENGTH)
    return s, indices
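
# Note: _boundary_gen is not defined in these snippets. A plausible sketch,
# assuming SPLIT_REGEX is a single compiled pattern whose match positions
# mark sentence boundaries; the real generator may work differently.
def _boundary_gen(text, boundary_regex):
    """Yield (start, end) offsets of the spans between boundary matches."""
    cuts = [0]
    for match in boundary_regex.finditer(text):
        cuts.append(match.end())
    if cuts[-1] != len(text):
        cuts.append(len(text))
    for start, end in zip(cuts, cuts[1:]):
        yield start, end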
Example #4
def test_string_normalization():
    s = '  This is    a test for  string normalization in    \n  all cases. '
    s = apply_regex_list(s, sentenize.NORM_REGEX)
    assert s == 'This is a test for string normalization in\nall cases.'
Example #5
def test_split_enumerations():
    s = 'Something quite annoying: (1) Enumerations are sometimes used as standalone sentences; (2) This is a case in which we want to split them off.'
    s = apply_regex_list(s, sentenize.SPLIT_ENUM_REGEX)
    assert s == 'Something quite annoying:\n(1) Enumerations are sometimes used as standalone sentences;\n(2) This is a case in which we want to split them off.'
Example #6
def test_formtok_split():
    s = 'Strings should be split after a formtok When the next sentence starts with an upper case letter.'
    s = apply_regex_list(s, sentenize.REFINED_SPLIT_REGEX)
    assert s == 'Strings should be split after a formtok\nWhen the next sentence starts with an upper case letter.'
Example #7
def test_recombination():
    s = 'Lets assume\nthere are splits.\nwhile there approx.\nshould be zero, e.\ng.\nBecause of abbreviations or Fig.\n5. As said by Test et al.\n[56].'
    s = apply_regex_list(s, sentenize.RECOMBINE_REGEX)
    assert s == 'Lets assume there are splits. while there approx. should be zero, e. g. Because of abbreviations or Fig. 5. As said by Test et al. [56].'
Example #8
def test_subsentence_recognition():
    s = 'There should be no splits (even with stuff like this.\nBut well..).'
    s = apply_regex_list(s, sentenize.SUBSENTENCE_REGEX)
    assert s == 'There should be no splits (even with stuff like this. But well..).'
Example #9
def test_refined_split():
    s = 'The refined split should do even more.For example find "errors".'
    s = apply_regex_list(s, sentenize.REFINED_SPLIT_REGEX)
    assert s == 'The refined split should do even more.\nFor example find "errors".'
Example #10
def test_bracket_correction():
    s = 'Testing errors(performed with brackets)in order to make sure they do not happen'
    s = apply_regex_list(s, corrections.CORRECTION_REGEX)
    assert s == 'Testing errors (performed with brackets) in order to make sure they do not happen'
Example #11
def test_semi_colon_correction():
    s = 'Errors with semi colons;they can happen;but are easy to correct.'
    s = apply_regex_list(s, corrections.CORRECTION_REGEX)
    assert s == 'Errors with semi colons; they can happen; but are easy to correct.'
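
# Note: putting the pieces together. Module and function names follow the
# tests above (sentenize, corrections); running the corrections before
# sentenizing is an assumption, not something the snippets prescribe.
import corrections
import sentenize

raw = 'Errors with semi colons;they can happen(often).This should end up on two lines.'
cleaned = corrections.correct(raw)        # fix spacing around ';' and brackets
sentences = sentenize.sentenize(cleaned)  # split into one sentence per line
print(sentences)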